diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..8d06c75
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,98 @@
+cmake_minimum_required(VERSION 3.16)
+project(vllm_ascend_C)
+
+# include(CheckCXXcompilerFlag)
+# check_cxx_compiler_flag("-std=c++17", COMPILER_SUPPORTS_CXX17)
+set(CMAKE_CXX_STANDARD 17)
+
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
+
+# Suppress potential warnings about unused manually-specified variables
+set(ignoreMe "${VLLM_PYTHON_PATH}")
+
+# TODO: Add 3.12 back when torch-npu supports 3.12
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11")
+
+find_package(pybind11 REQUIRED)
+
+append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
+set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
+
+find_package(Torch REQUIRED)
+
+set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+set(SOC_VERSION ${SOC_VERSION})
+message(STATUS "Detected SOC version: ${SOC_VERSION}")
+
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Release)" FORCE)
+endif()
+
+if (CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path to install()")
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH})
+if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
+endif()
+
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+file(GLOB KERNEL_FILES
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)
+
+ascendc_library(vllm_ascend_kernels SHARED + ${KERNEL_FILES} +) + +message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + +file(GLOB VLLM_ASCEND_SRC +${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp) + +include_directories( + ${pybind11_INCLUDE_DIRS} + ${PYTHON_INCLUDE_PATH} + ${TORCH_INCLUDE_DIRS} + ${TORCH_NPU_PATH}/include + ${ASCEND_HOME_PATH}/include + ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform + ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform +) + +set( + INCLUDES + ${TORCH_INCLUDE_DIRS} + ${TORCH_NPU_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include + ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform +) + +pybind11_add_module(vllm_ascend_C ${VLLM_ASCEND_SRC}) + +target_link_directories( + vllm_ascend_C + PRIVATE + ${TORCH_NPU_PATH}/lib/ + ${ASCEND_HOME_PATH}/lib64 +) + +target_link_libraries( + vllm_ascend_C + PUBLIC + ${TORCH_LIBRARIES} + libtorch_npu.so + vllm_ascend_kernels + ascendcl + platform +) + +target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") + +install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..acdb2f7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,127 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a87fa14 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing to vLLM Ascend + +You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including step-by-step guide to help you setup development environment, contribute first PR and test locally. diff --git a/DCO b/DCO new file mode 100644 index 0000000..49b8cb0 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. 
+ + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..29d6445 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p b/Dockerfile.310p new file mode 100644 index 0000000..4eb3c63 --- /dev/null +++ b/Dockerfile.310p @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei 
Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler new file mode 100644 index 0000000..a9d7b34 --- /dev/null +++ b/Dockerfile.310p.openEuler @@ -0,0 +1,59 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.a3 b/Dockerfile.a3 new file mode 100644 index 0000000..8bdfb0e --- /dev/null +++ b/Dockerfile.a3 @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei 
Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler new file mode 100644 index 0000000..aff585b --- /dev/null +++ b/Dockerfile.a3.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler new file mode 100644 index 0000000..47a0c60 --- /dev/null +++ b/Dockerfile.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei 
Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.en.md b/README.en.md new file mode 100644 index 0000000..72ed323 --- /dev/null +++ b/README.en.md @@ -0,0 +1,91 @@ +

+ + + vllm-ascend + +

+ +

+vLLM Ascend Plugin +

+ +

+| About Ascend | Documentation | #sig-ascend | Users Forum | Weekly Meeting | +

+ +

+English | 中文 +

+ +--- +*Latest News* 🔥 +- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend. +- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF). +- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with ‌LLaMA-Factory/verl//TRL/GPUStack‌ to demonstrate how ‌vLLM Ascend‌ assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios. +- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors. +- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html). +- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF). +- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU. +- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162). 
+--- +## Overview + +vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU. + +It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM. + +By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU. + +## Prerequisites + +- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental) +- OS: Linux +- Software: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (the same version as vllm-ascend) + +## Getting Started + +Please use the following recommended versions to get started quickly: + +| Version | Release type | Doc | +|------------|--------------|--------------------------------------| +|v0.10.1rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details| +|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details| + +## Contributing +See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test. 
+ +We welcome and value any contributions and collaborations: +- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues) +- Please use the [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help. + +## Branch + +vllm-ascend has a main branch and dev branches. + +- **main**: main branch, corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI. +- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version. + +Below are the maintained branches: + +| Branch | Status | Note | +|------------|--------------|--------------------------------------| +| main | Maintained | CI commitment for vLLM main branch and vLLM 0.10.x branch | +| v0.7.1-dev | Unmaintained | Only doc fixes are allowed | +| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fixes are allowed and no new release tags any more. | +| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version | +| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration | + +Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details. + +## Weekly Meeting + +- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting +- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15)) + +## License + +Apache License 2.0, as found in the [LICENSE](./LICENSE) file. 
diff --git a/README.md b/README.md index c00eb9e..8c78bcf 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,34 @@ # enginex-ascend-910-vllm -运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型 \ No newline at end of file +运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型 + +## 镜像 + +Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.10.0rc1 + +## 总览 + +vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。 + +此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。 + +使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。 + +## 准备 + +- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持) +- 操作系统:Linux +- 软件: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (与vllm-ascend版本一致) + +## 开始使用 + +推荐您使用以下版本快速开始使用: + +| Version | Release type | Doc | +|------------|--------------|--------------------------------------| +|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多| +|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多| \ No newline at end of file diff --git a/README.zh.md b/README.zh.md new file mode 100644 index 0000000..d7f1310 --- /dev/null +++ b/README.zh.md @@ -0,0 +1,90 @@ +

+ + + vllm-ascend + +

+ +

+vLLM Ascend Plugin +

+ +

+| 关于昇腾 | 官方文档 | #sig-ascend | 用户论坛 | 社区例会 | +

+ +

+English | 中文 +

+ +--- +*最新消息* 🔥 + +- [2025/09] 我们发布了新的正式版本 [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! 请按照[官方指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html)开始在Ascend上部署大型专家并行 (EP)。 +- [2025/08] 我们与vLLM和腾讯合作举办了[vLLM北京Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q),!请在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料。 +- [2025/06] [用户案例](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html)现已上线!展示了LLaMA-Factory/verl/TRL/GPUStack等用户案例,展示了vLLM Ascend如何帮助昇腾用户在模型微调、评估、强化学习 (RL) 以及部署等场景中提升体验。 +- [2025/06] [贡献者](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)页面现已上线!所有的贡献都值得被记录,感谢所有的贡献者。 +- [2025/05] 我们发布了首个正式版本 [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)!我们与 vLLM 社区合作发布了一篇博客文章,分享了我们的实践:[Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html)。 +- [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料. +- [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库,让vLLM可以无缝运行在Ascend NPU。 +- [2024/12] 我们正在与 vLLM 社区合作,以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162). 
+--- +## 总览 + +vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。 + +此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。 + +使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。 + +## 准备 + +- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持) +- 操作系统:Linux +- 软件: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (与vllm-ascend版本一致) + +## 开始使用 + +推荐您使用以下版本快速开始使用: + +| Version | Release type | Doc | +|------------|--------------|--------------------------------------| +|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多| +|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多| + +## 贡献 +请参考 [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。 + +我们欢迎并重视任何形式的贡献与合作: +- 请通过[Issue](https://github.com/vllm-project/vllm-ascend/issues)来告知我们您遇到的任何Bug。 +- 请通过[用户论坛](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support)来交流使用问题和寻求帮助。 + +## 分支策略 +vllm-ascend有主干分支和开发分支。 + +- **main**: 主干分支,与vLLM的主干分支对应,并通过昇腾CI持续进行质量看护。 +- **vX.Y.Z-dev**: 开发分支,随vLLM部分新版本发布而创建,比如`v0.7.3-dev`是vllm-ascend针对vLLM `v0.7.3`版本的开发分支。 + +下面是维护中的分支: + +| 分支 | 状态 | 备注 | +|------------|------------|---------------------| +| main | Maintained | 基于vLLM main分支CI看护 | +| v0.7.1-dev | Unmaintained | 只允许文档修复 | +| v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护, 只允许Bug修复,不会再发布新版本 | +| v0.9.1-dev | Maintained | 基于vLLM v0.9.1版本CI看护 | +|rfc/feature-name| Maintained | 
为协作创建的[特性分支](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) | + +请参阅[版本策略](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html)了解更多详细信息。 + +## 社区例会 + +- vLLM Ascend 每周社区例会: https://tinyurl.com/vllm-ascend-meeting +- 每周三下午,15:00 - 16:00 (UTC+8, [查看您的时区](https://dateful.com/convert/gmt8?t=15)) + +## 许可证 +Apache 许可证 2.0,如 [LICENSE](./LICENSE) 文件中所示。 diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000..64a55cc --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,175 @@ +# Introduction +This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance. + +# Overview +**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas800I A2 (see [quick_start](../docs/source/quick_start.md) to learn more supported devices list), with different models(coming soon). +- Latency tests + - Input length: 32 tokens. + - Output length: 128 tokens. + - Batch size: fixed (8). + - Models: Qwen2.5-7B-Instruct, Qwen3-8B. + - Evaluation metrics: end-to-end latency (mean, median, p99). + +- Throughput tests + - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). + - Output length: the corresponding output length of these 200 prompts. + - Batch size: dynamically determined by vllm to achieve maximum throughput. + - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B. + - Evaluation metrics: throughput. +- Serving tests + - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). + - Output length: the corresponding output length of these 200 prompts. + - Batch size: dynamically determined by vllm and the arrival pattern of the requests. 
+ - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). + - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B. + - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + +**Benchmarking Duration**: about 800 seconds per model. + +# Quick Use +## Prerequisites +Before running the benchmarks, ensure the following: + +- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices. + +- Install necessary dependencies for benchmarks: + + ```shell + pip install -r benchmarks/requirements-bench.txt + ``` + +- For performance benchmarks, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) as `dummy`. It will construct random weights based on the passed model without downloading the weights from the internet, which can greatly reduce the benchmark time. 
+- If you want to run benchmark customized, feel free to add your own models and parameters in the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests), let's take `Qwen2.5-VL-7B-Instruct`as an example: + + ```shell + [ + { + "test_name": "serving_qwen2_5vl_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "trust_remote_code": "", + "max_model_len": 16384 + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "backend": "openai-chat", + "dataset_name": "hf", + "hf_split": "train", + "endpoint": "/v1/chat/completions", + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + } + ] + ``` + +this Json will be structured and parsed into server parameters and client parameters by the benchmark script. This configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters, for more parameters details, see vllm benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks). + + - **Test Overview** + - Test Name: serving_qwen2_5vl_7B_tp1 + + - Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing). + + - Server Parameters + - Model: Qwen/Qwen2.5-VL-7B-Instruct + + - Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node) + + - Swap Space: 16 GB (used to handle memory overflow by swapping to disk) + + - disable_log_stats: disables logging of performance statistics. + + - disable_log_requests: disables logging of individual requests. 
+ + - Trust Remote Code: enabled (allows execution of model-specific custom code) + + - Max Model Length: 16,384 tokens (maximum context length supported by the model) + + - Client Parameters + + - Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server) + + - Backend: openai-chat (suggests the client uses the OpenAI-compatible chat API format) + + - Dataset Source: Hugging Face (hf) + + - Dataset Split: train + + - Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent) + + - Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face) + + - Number of Prompts: 200 (the total number of prompts used during the test) + +## Run benchmarks + +### Use benchmark script +The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run command in the vllm-ascend root directory: + +```shell +bash benchmarks/scripts/run-performance-benchmarks.sh +``` + +Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following: + +```shell +. +|-- serving_qwen2_5_7B_tp1_qps_1.json +|-- serving_qwen2_5_7B_tp1_qps_16.json +|-- serving_qwen2_5_7B_tp1_qps_4.json +|-- serving_qwen2_5_7B_tp1_qps_inf.json +|-- latency_qwen2_5_7B_tp1.json +|-- throughput_qwen2_5_7B_tp1.json +``` + +These files contain detailed benchmarking results for further analysis. + +### Use benchmark cli + +For more flexible and customized use, benchmark cli is also provided to run online/offline benchmarks +Similarly, let’s take `Qwen2.5-VL-7B-Instruct` benchmark as an example: +#### Online serving +1. Launch the server: + + ```shell + vllm serve Qwen2.5-VL-7B-Instruct --max-model-len 16789 + ``` + +2. 
Running performance tests using cli + + ```shell + vllm bench serve --model Qwen2.5-VL-7B-Instruct\ + --endpoint-type "openai-chat" --dataset-name hf \ + --hf-split train --endpoint "/v1/chat/completions" \ + --dataset-path "lmarena-ai/vision-arena-bench-v0.1" \ + --num-prompts 200 \ + --request-rate 16 + ``` + +#### Offline +- **Throughput** + + ```shell + vllm bench throughput --output-json results/throughput_qwen2_5_7B_tp1.json \ + --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 --load-format dummy \ + --dataset-path /github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 200 --backend vllm + ``` + +- **Latency** + + ```shell + vllm bench latency --output-json results/latency_qwen2_5_7B_tp1.json \ + --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 \ + --load-format dummy --num-iters-warmup 5 --num-iters 15 + ``` diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 0000000..b3ef7ec --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,158 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove 
warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, + org_vocab_start_index: int, + org_vocab_end_index: int, + num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index + ) + added_offset = ( + added_vocab_start_index + - (org_vocab_end_index - org_vocab_start_index) + - num_org_vocab_padding + ) + valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + added_offset * added_vocab_mask + ) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref( + input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"], + ) + + # Define custom function + 
def custom_fn(): + return torch.ops._C.get_masked_input_and_mask( + input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"], + ) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time * 1000:.3f} ms") + print(f"Custom implementation: {custom_time * 1000:.3f} ms") + print(f"Speedup: {ref_time / custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}", + ) + torch.testing.assert_close( + custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}", + ) diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt new file mode 100644 index 0000000..2290823 --- /dev/null +++ b/benchmarks/requirements-bench.txt @@ -0,0 +1,4 @@ +pandas +datasets +modelscope +tabulate \ No newline at end of file diff --git a/benchmarks/scripts/convert_json_to_markdown.py b/benchmarks/scripts/convert_json_to_markdown.py new file mode 100644 index 0000000..1120434 --- /dev/null +++ b/benchmarks/scripts/convert_json_to_markdown.py @@ -0,0 +1,188 @@ +import argparse +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +CUR_PATH = Path(__file__).parent.resolve() +# latency results and the keys that will be 
printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "avg_latency": "Mean latency (ms)", + "P50": "Median latency (ms)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "num_requests": "Num of reqs", + "total_num_tokens": "Total num of tokens", + "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "request_rate": "Request rate (req/s)", + "request_throughput": "Tput (req/s)", + "output_throughput": "Output Tput (tok/s)", + "median_ttft_ms": "TTFT (ms)", + "median_tpot_ms": "TPOT (ms)", + "median_itl_ms": "ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file) as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps( + { + "latency": latency.to_dict(), + "throughput": throughput.to_dict(), + "serving": serving.to_dict(), + } + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Process the results of the benchmark tests." + ) + parser.add_argument( + "--results_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.", + ) + parser.add_argument( + "--output_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.", + ) + parser.add_argument( + "--markdown_template", + type=str, + default="./perf_result_template.md", + help="The template file for the markdown report.", + ) + parser.add_argument( + "--tag", default="main", help="Tag to be used for release message." 
+ ) + parser.add_argument( + "--commit_id", default="", help="Commit ID to be used for release message." + ) + + args = parser.parse_args() + results_folder = (CUR_PATH / args.results_folder).resolve() + output_folder = (CUR_PATH / args.output_folder).resolve() + markdown_template = (CUR_PATH / args.markdown_template).resolve() + + # collect results + for test_file in results_folder.glob("*.json"): + with open(test_file) as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} + ) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + serving_results.sort(key=lambda x: (len(x["test_name"]), x["test_name"])) + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json( + latency_results, throughput_results, serving_results + ) + + # remapping the key, for visualization purpose + if 
not latency_results.empty: + latency_results = latency_results[list(latency_column_mapping.keys())].rename( + columns=latency_column_mapping + ) + if not serving_results.empty: + serving_results = serving_results[list(serving_column_mapping.keys())].rename( + columns=serving_column_mapping + ) + if not throughput_results.empty: + throughput_results = throughput_results[ + list(throughput_results_column_mapping.keys()) + ].rename(columns=throughput_results_column_mapping) + + processed_results_json = results_to_json( + latency_results, throughput_results, serving_results + ) + + # get markdown tables + latency_md_table = tabulate( + latency_results, headers="keys", tablefmt="pipe", showindex=False + ) + serving_md_table = tabulate( + serving_results, headers="keys", tablefmt="pipe", showindex=False + ) + throughput_md_table = tabulate( + throughput_results, headers="keys", tablefmt="pipe", showindex=False + ) + + # document the result + print(output_folder) + with open(output_folder / "benchmark_results.md", "w") as f: + results = read_markdown(markdown_template) + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json, + ) + f.write(results) diff --git a/benchmarks/scripts/perf_result_template.md b/benchmarks/scripts/perf_result_template.md new file mode 100644 index 0000000..cb6a2e6 --- /dev/null +++ b/benchmarks/scripts/perf_result_template.md @@ -0,0 +1,31 @@ +## Online serving tests + +- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed). 
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: throughput, TTFT (median time to the first token), ITL (median inter-token latency), TPOT (median time per output token).
+
+{serving_tests_markdown_table}
+
+## Offline tests
+### Latency tests
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: end-to-end latency.
+
+{latency_tests_markdown_table}
+
+### Throughput tests
+
+- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: throughput.
+
+{throughput_tests_markdown_table}
diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh
new file mode 100644
index 0000000..b604fe9
--- /dev/null
+++ b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -0,0 +1,321 @@
+#!/bin/bash
+set -e
+
+check_npus() {
+    # shellcheck disable=SC2155
+    declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')
+
+    if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
+        echo "Need at least 1 NPU to run benchmarking."
+        exit 1
+    else
+        echo "found NPU count: $npu_count"
+    fi
+
+    npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')
+
+    echo "NPU type is: $npu_type"
+}
+
+ensure_sharegpt_downloaded() {
+    local FILE="/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
+    local DIR
+    DIR=$(dirname "$FILE")
+
+    if [ ! -f "$FILE" ]; then
+        echo "$FILE not found, downloading from hf-mirror ..."
+        mkdir -p "$DIR"
+        wget -O "$FILE" https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+        if [ $? -ne 0 ]; then
+            echo "Download failed!" >&2
+            return 1
+        fi
+        echo "Download completed and saved to $FILE"
+    else
+        echo "$FILE already exists."
+ fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args + args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + local waited=0 + local timeout_sec=1200 + + while (( waited < timeout_sec )); do + if curl -s -X GET localhost:8000/health > /dev/null; then + return 0 + fi + echo "Waiting for vllm server to start..." + sleep 1 + ((waited++)) + done + + echo "Timeout waiting for server" + return 1 +} + +get_cur_npu_id() { + npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}' +} + +kill_npu_processes() { + ps -aux + lsof -t -i:8000 | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + + sleep 4 + rm -rf ~/.config/vllm + +} + +update_json_field() { + local json_file="$1" + local field_name="$2" + local field_value="$3" + + jq --arg value "$field_value" \ + --arg key "$field_name" \ + '.[$key] = $value' "$json_file" > "${json_file}.tmp" && \ + mv "${json_file}.tmp" "$json_file" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + latency_command="vllm bench latency \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # run the benchmark + eval "$latency_command" + # echo model_name to result file + model_name=$(echo "$latency_params" | jq -r '.model') + update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name" + kill_npu_processes + + done +} + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + throughput_command="vllm bench throughput \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + + # run the benchmark + eval "$throughput_command" + # echo model_name to result file + model_name=$(echo "$throughput_params" | jq -r '.model') + update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name" + kill_npu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." + continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + bash -c "$server_command" & + server_pid=$! + + # wait until the server is alive + if wait_for_server; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + done + + # clean up + kill -9 $server_pid + kill_npu_processes + done +} + +cleanup() { + rm -rf ./vllm_benchmarks +} + +cleanup_on_error() { + echo "An error occurred. Cleaning up results folder..." 
+    rm -rf $RESULTS_FOLDER
+}
+
+main() {
+    START_TIME=$(date +%s)
+    check_npus
+
+    # dependencies
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    (which lsof) || (apt-get update && apt-get install -y lsof)
+
+    # get the current IP address, required by benchmark_serving.py
+    # shellcheck disable=SC2155
+    export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+    # turn off the reporting of the status of each request, to clean up the terminal output
+    export VLLM_LOG_LEVEL="WARNING"
+
+    # set env
+    export VLLM_USE_MODELSCOPE=True
+
+    # prepare for benchmarking
+    cd benchmarks || exit 1
+    trap cleanup EXIT
+
+    QUICK_BENCHMARK_ROOT=./
+
+    declare -g RESULTS_FOLDER=results
+    mkdir -p $RESULTS_FOLDER
+
+    trap cleanup_on_error ERR
+    ensure_sharegpt_downloaded
+    # benchmarks
+    run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
+    run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
+    run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
+
+    END_TIME=$(date +%s)
+    ELAPSED_TIME=$((END_TIME - START_TIME))
+    echo "Total execution time: $ELAPSED_TIME seconds"
+
+}
+
+main "$@"
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
new file mode 100644
index 0000000..40cec4c
--- /dev/null
+++ b/benchmarks/tests/latency-tests.json
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "latency_qwen3_8B_tp1",
+        "parameters": {
+            "model": "Qwen/Qwen3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "max_model_len": 16384,
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_qwen2_5_7B_tp1",
+        "parameters": {
+            "model": "Qwen/Qwen2.5-7B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
new file mode 100644
index 0000000..6398710
--- /dev/null
+++ 
b/benchmarks/tests/serving-tests.json @@ -0,0 +1,77 @@ +[ + { + "test_name": "serving_qwen2_5vl_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "trust_remote_code": "", + "max_model_len": 16384 + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "endpoint_type": "openai-chat", + "dataset_name": "hf", + "hf_split": "train", + "endpoint": "/v1/chat/completions", + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_8B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000..3698e69 --- /dev/null +++ b/benchmarks/tests/throughput-tests.json @@ -0,0 +1,38 @@ +[ + { + "test_name": "throughput_qwen3_8B_tp1", + "parameters": { + "model": 
"Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_qwen2_5vl_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "backend": "vllm-chat", + "dataset_name": "hf", + "hf_split": "train", + "max_model_len": 16384, + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] + diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 0000000..62078fd --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,133 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. 
+# If an error is encountered when running
+# python, a fatal message `ERR_MSG` is issued.
+#
+function (run_python OUT EXPR ERR_MSG)
+  execute_process(
+    COMMAND
+      "${PYTHON_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_VARIABLE PYTHON_STDERR
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(NOT PYTHON_ERROR_CODE EQUAL 0)
+    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
+  endif()
+  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
+endfunction()
+
+# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
+# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
+macro (append_cmake_prefix_path PKG EXPR)
+  run_python(_PREFIX_PATH
+    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
+  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
+endmacro()
+
+
+# This cmake function is adapted from the upstream vLLM project's cmake/utils.cmake.
+# Define a target named `GPU_MOD_NAME` for a single extension. The
+# arguments are:
+#
+# DESTINATION - Module destination directory.
+# LANGUAGE - The GPU language for this module, e.g CUDA, HIP,
+# etc.
+# SOURCES - List of source files relative to CMakeLists.txt
+# directory.
+#
+# Optional arguments:
+#
+# ARCHITECTURES - A list of target GPU architectures in cmake
+# format.
+# Refer `CMAKE_CUDA_ARCHITECTURES` documentation
+# and `CMAKE_HIP_ARCHITECTURES` for more info.
+# ARCHITECTURES will use cmake's defaults if
+# not provided.
+# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip.
+# INCLUDE_DIRECTORIES - Extra include directories.
+# LIBRARIES - Extra link libraries.
+# WITH_SOABI - Generate library with python SOABI suffix name.
+# USE_SABI - Use python stable api
+#
+# Note: optimization level/debug info is set via cmake build type.
+#
+function (define_gpu_extension_target GPU_MOD_NAME)
+  cmake_parse_arguments(PARSE_ARGV 1
+    GPU
+    "WITH_SOABI"
+    "DESTINATION;LANGUAGE;USE_SABI"
+    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
+
+  # Add hipify preprocessing step when building with HIP/ROCm.
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_WITH_SOABI)
+    set(GPU_WITH_SOABI WITH_SOABI)
+  else()
+    set(GPU_WITH_SOABI)
+  endif()
+
+  if (GPU_USE_SABI)
+    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()
+
+  if (GPU_LANGUAGE STREQUAL "HIP")
+    # Make this target dependent on the hipify preprocessor step.
+    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
+  endif()
+
+  if (GPU_ARCHITECTURES)
+    set_target_properties(${GPU_MOD_NAME} PROPERTIES
+      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
+  endif()
+
+  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
+
+  target_compile_options(${GPU_MOD_NAME} PRIVATE
+    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
+
+  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
+    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
+
+  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
+    ${GPU_INCLUDE_DIRECTORIES})
+
+  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
+
+  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+  # dependencies that are not necessary and may not be installed.
+ if (GPU_LANGUAGE STREQUAL "CUDA") + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) + else() + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + endif() + + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME}) +endfunction() diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..3bf401b --- /dev/null +++ b/codecov.yml @@ -0,0 +1,28 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +coverage: + status: + # Patch coverage is mandatory and must be >= 80% + patch: + default: + target: 80% + # non-voting + project: + default: + # non-voting + informational: true diff --git a/collect_env.py b/collect_env.py new file mode 100644 index 0000000..68d97a7 --- /dev/null +++ b/collect_env.py @@ -0,0 +1,489 @@ +# +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# Adapted from https://github.com/vllm-project/vllm/blob/main/collect_env.py +# + +import datetime +import locale +import os +import re +import subprocess +import sys +from collections import namedtuple + +from vllm.envs import environment_variables + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple( + 'SystemEnv', + [ + 'torch_version', + 'is_debug_build', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'cpu_info', + 'vllm_version', # vllm specific field + 'vllm_ascend_version', # vllm ascend specific field + 'env_vars', + 'npu_info', # ascend specific field + 'cann_info', # ascend specific field + ]) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "soumith", + "mkl", + "magma", + "optree", + "transformers", + "zmq", + "pynvml", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "optree", + "onnx", + "transformers", + "zmq", + "pynvml", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def 
run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join(line for line in out.splitlines() + if not line.startswith("#") and any(name in line + for name in patterns)) + + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', + r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', + r'cmake (.*)') + + +def _parse_version(version, version_tuple): + version_str = version_tuple[-1] + if isinstance(version_str, str) and version_str.startswith('g'): + if '.' 
in version_str: + git_sha = version_str.split('.')[0][1:] + date = version_str.split('.')[-1][1:] + return f"{version} (git sha: {git_sha}, date: {date})" + else: + git_sha = version_str[1:] # type: ignore + return f"{version} (git sha: {git_sha})" + return version + + +def get_vllm_version(): + from vllm import __version__, __version_tuple__ + return _parse_version(__version__, __version_tuple__) + + +def get_vllm_ascend_version(): + from vllm_ascend._version import __version__, __version_tuple__ + return _parse_version(__version__, __version_tuple__) + + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda( + 'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE' + ) + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', + r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all( + run_lambda, + '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', + r'Description:\t(.*)') + + 
+def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join(line for line in out.splitlines() + if any(name in line for name in patterns)) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_npu_info(run_lambda): + return run_and_read_all(run_lambda, 'npu-smi info') + + +def get_cann_info(run_lambda): + out = run_and_read_all(run_lambda, 'lscpu | grep Architecture:') + cpu_arch = str(out).split()[-1] + return run_and_read_all( + run_lambda, + 'cat /usr/local/Ascend/ascend-toolkit/latest/{}-linux/ascend_toolkit_install.info' + .format(cpu_arch)) + + +def get_env_vars(): + env_vars = '' + secret_terms = ('secret', 'token', 'api', 'access', 'password') + report_prefix = ("TORCH", "PYTORCH", "ASCEND_", "ATB_") + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars + + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + else: + version_str = debug_mode_str = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format( + sys_version, + sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + pip_version=pip_version, + 
pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + cpu_info=get_cpu_info(run_lambda), + vllm_version=get_vllm_version(), + vllm_ascend_version=get_vllm_ascend_version(), + env_vars=get_env_vars(), + npu_info=get_npu_info(run_lambda), + cann_info=get_cann_info(run_lambda), + ) + + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +# both the above code and the following code use `strip()` to +# remove leading/trailing whitespaces, so we need to add a newline +# in between to separate the two sections +env_info_fmt += "\n" + +env_info_fmt += """ +vLLM Version: {vllm_version} +vLLM Ascend Version: {vllm_ascend_version} + +ENV Variables: +{env_vars} + +NPU: +{npu_info} + +CANN: +{cann_info} +""".strip() + + +def pretty_str(envinfo): + + def replace_nones(dct, replacement='Could not collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is 
multiline, prepend a \n to it. + if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty( + mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty( + mutable_dict['conda_packages']) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend( + mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend( + mutable_dict['conda_packages'], '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + mutable_dict['npu_info'] = envinfo.npu_info + mutable_dict['cann_info'] = envinfo.cann_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr( + torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [ + os.path.join(minidump_dir, dump) + for dump in os.listdir(minidump_dir) + ] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime( + '%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include 
it when you file a report ***" + print(msg, file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/csrc/camem_allocator.cpp b/csrc/camem_allocator.cpp new file mode 100644 index 0000000..8cba79d --- /dev/null +++ b/csrc/camem_allocator.cpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include "acl/acl.h" + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + aclrtContext pctx; + aclrtGetCurrentContext(&pctx); + if (!pctx) { + // Ensure device context. 
+ aclrtCreateContext(&pctx, device); + aclrtSetCurrentContext(pctx); + } +} + +void create_and_map(unsigned long long device, ssize_t size, void* d_mem, + aclrtDrvMemHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.id = device; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.reserve = 0; + + // Allocate memory using aclrtMallocPhysical + aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return; + } + error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return; + } +} + +void unmap_and_release(unsigned long long device, ssize_t size, + void* d_mem, + aclrtDrvMemHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + aclError error_code = aclrtUnmapMem(d_mem); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return; + } + error_code = aclrtFreePhysical(*p_memHandle); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return; + } +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to 
Python objects and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. + + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, aclrtStream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a aclrtDrvMemHandle + + // Define memory allocation properties + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.id = device; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.reserve = 0; + + // Check if the allocation is supported + size_t granularity; + aclError error_code = aclrtMemGetAllocationGranularity(&prop, + ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, + &granularity); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return nullptr; + } + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + void *d_mem; + error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return nullptr; + } + // allocate the aclrtDrvMemHandle + aclrtDrvMemHandle* p_memHandle = + (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle)); + + if 
(!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + void *d_mem = (void*)recv_d_mem; + // allocate the 
aclrtDrvMemHandle + aclrtDrvMemHandle* p_memHandle = + (aclrtDrvMemHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + aclError error_code = aclrtReleaseMemAddress(d_mem); + if (error_code != 0) { + std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; + return; + } + free(p_memHandle); +} + +// --------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. 
+ g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + void *d_mem_ptr = (void*)recv_d_mem; + aclrtDrvMemHandle* p_memHandle = + (aclrtDrvMemHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + void *d_mem_ptr = (void*)recv_d_mem; + aclrtDrvMemHandle* p_memHandle = + (aclrtDrvMemHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, 
"Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef camem_allocator_module = { + PyModuleDef_HEAD_INIT, "camem_allocator", + "CANN-mem-based allocator for NPUPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_vllm_ascend_C(void) { + // Initialize the module + PyObject* module = PyModule_Create(&camem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/csrc/kernels/bgmv_expand.cpp b/csrc/kernels/bgmv_expand.cpp new file mode 100644 index 0000000..c910005 --- /dev/null +++ b/csrc/kernels/bgmv_expand.cpp @@ -0,0 +1,369 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "kernel_operator.h" +#include "types.h" + +template +class BGMVExpand { +public: + using X_T = float; + using W_T = scalar_t; + using Y_T = scalar_t; + + static constexpr uint64_t LORA_RANK_8 = 8; + static constexpr uint64_t LORA_RANK_16 = 16; + static constexpr uint64_t LORA_RANK_32 = 32; + static constexpr uint64_t LORA_RANK_64 = 64; + static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64}; + static constexpr int32_t BUFFER_NUM = 2; + + // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time. 
+ static constexpr int32_t NUM_BYTES_PER_REPEAT = 256; + static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8; + // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type). + static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Mask is used to control the elements that participate in computation in each iteration. + static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants. + static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192; + static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096; + static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT; + // BlockReduceSum would generate(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT)floats. + // So need to read them all and apply PairReduceSum + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = + (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16. + // Same for rank=64, we do not support ranks greater than 64. 
+ static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2; + +public: + __aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {} + + __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices, + uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut, + uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + maxLoRARank_ = maxLoRARank; + outputHiddenDim_ = outputHiddenDim; + sliceOffset_ = sliceOffset; + outputFullDim_ = outputFullDim; + singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut); + indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize); + + pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T)); + pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + + pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + + // Each compute iteration would generate not one, but several output elements. + // Therefore, the following variable would determine how many output elements are calculated in each iteration. 
+ numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_); + numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_; + + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + yOffset_ = outputFullDim_ * idx + sliceOffset_; + + // Set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + CopyInX(idx); + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + for (int32_t i = 0; i < numStreamOut; i++) { + CopyInY(i); + for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) { + CopyInW(i * numStreamInPerOutputTile_ + j); + Compute(j * numOutputElementsPerInputTile_); + } + ScaleOutput(); + CopyOut(i); + } + ComputeLastIteration(); + } + } + +private: + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // Look up the LoRA index + reqLoRAIndex_ = indicesGm_.GetValue(idx); + } + + __aicore__ inline void ComputeLastIteration() + { + int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS; + if (remainingY == 0) { + return; + } + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + int32_t remainingW = remainingY * maxLoRARank_; + int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS; + int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS; + + CopyInY(numStreamOut, remainingY); + + int32_t outputIdx = 0; + for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx); + Compute(outputIdx * numOutputElementsPerInputTile_); + } + + if (remainingWForLastRepeat != 0) { + CopyInW(numStreamOut * 
numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration, + remainingWForLastRepeat); + int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat16 = + (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2; + int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_; + Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32); + } + + ScaleOutput(remainingY); + CopyOut(numStreamOut, remainingY); + } + + __aicore__ inline void CopyInX(const int64_t idx) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + if constexpr (std::is_same_v) { + DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_); + } else { + uint16_t blockLen = static_cast(maxLoRARank_ * sizeof(X_T)); + DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {}); + } + inQueueX_.EnQue(xLocal); + xLocal = inQueueX_.DeQue(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + + // As we are generating multiple output elements with one API invocation, + // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT + if constexpr (std::is_same_v) { + for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xLocal.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } else { + Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); + pipe_barrier(PIPE_V); + + for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xDup.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } + inQueueX_.FreeTensor(xLocal); + } + + __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yInLocal = inQueueY_.AllocTensor(); + 
DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements); + inQueueY_.EnQue(yInLocal); + } + + __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements); + inQueueW_.EnQue(wLocal); + } + + __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); + AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); + Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueY_.FreeTensor(yInLocal); + + Add(yLocal, yLocal, yInLocalFP32, numElements); + pipe_barrier(PIPE_V); + + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void Compute(int32_t progress, + int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS, + int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16, + int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + + Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); + pipe_barrier(PIPE_V); + + if (maxLoRARank_ == LORA_RANK_8) { + BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, 
reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_16) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_32) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_64) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } + } + + __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe* pipe_; + AscendC::TQue inQueueY_, inQueueW_; + AscendC::TQue inQueueX_; + AscendC::TQue outQueueY_; + 
AscendC::TBuf tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor yInGm_; + AscendC::GlobalTensor yOutGm_; + AscendC::GlobalTensor indicesGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t maxLoRARank_; + uint32_t outputHiddenDim_; + uint32_t sliceOffset_; + uint32_t outputFullDim_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + uint32_t numOutputElementsPerInputTile_; + uint32_t numStreamInPerOutputTile_; + uint64_t yOffset_; + + // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously. + // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat, + // reads next 8 consecutive blocks in the second repeat. + AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4}; + + // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block, + // so we set dstRepStride = 1 + AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8}; + + // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks. 
+ // For xDup we repeatedly use it, so we set src0RepStride = 0 + AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8}; + +}; + +#define BGMV_EXPAND_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\ + uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,\ + uint32_t batchSize, uint32_t numTokensPerCore, \ + uint32_t maxLoRARank, uint32_t outputHiddenDim, \ + uint32_t sliceOffset, uint32_t outputFullDim) \ + { \ + AscendC::TPipe pipe; \ + BGMVExpand op(&pipe); \ + op.Init(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \ + outputHiddenDim, sliceOffset, outputFullDim); \ + op.Process(); \ + } + +// declare all dtype kernel +BGMV_EXPAND_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + BGMV_EXPAND_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize, + void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + bgmv_expand_half<<>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, + maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + bgmv_expand_bfloat16_t<<>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, + sliceOffset, outputFullDim); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/bgmv_shrink.cpp b/csrc/kernels/bgmv_shrink.cpp new file mode 100644 index 0000000..b5a2d15 --- /dev/null +++ b/csrc/kernels/bgmv_shrink.cpp @@ -0,0 +1,252 @@ +/* + * 
Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_operator.h"
+#include "types.h"
+
+// BGMV "shrink" kernel: for each token, looks up its LoRA index and computes a
+// rank-sized output as scaled dot products of the token's hidden vector against
+// each row of the selected LoRA weight (see ProcessImpl/Compute below).
+// NOTE(review): throughout this diff, text inside angle brackets (template
+// parameter lists, LocalTensor element types, TQue positions, kernel-launch
+// configs) appears to have been stripped by extraction — e.g. "template" with
+// no parameter list on the next line; confirm against the upstream source.
+template
+class BGMVShrink {
+public:
+    using X_T = scalar_t;
+    using W_T = scalar_t;
+    using Y_T = float;
+
+    static constexpr uint64_t BUFFER_NUM = 1;
+    static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length
+
+public:
+    __aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
+    // Binds global buffers, captures shape/scale parameters and allocates the
+    // on-chip queues/buffers. `incremental_` selects tiled streaming when the
+    // hidden dim exceeds one tile.
+    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, uint32_t indicesSize, __gm__ void *y,
+                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                                uint32_t maxLoRARank, float scale)
+    {
+        batchSize_ = batchSize;
+        numTokensPerCore_ = numTokensPerCore;
+        inputHiddenDim_ = inputHiddenDim;
+        maxLoRARank_ = maxLoRARank;
+        scale_ = scale;
+        singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
+        incremental_ = inputHiddenDim_ > TILE_LENGTH;
+
+        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
+        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
+        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
+        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
+
+        pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
+        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
+        pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
+        pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
+
+        pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
+        pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
+    }
+
+    // Per-core main loop: each block processes its [startIdx, endIdx) token
+    // range; tokens with a negative LoRA index are skipped entirely.
+    __aicore__ inline void Process()
+    {
+        int64_t blockIdx = AscendC::GetBlockIdx();
+        int64_t startIdx = blockIdx * numTokensPerCore_;
+        int64_t endIdx = startIdx + numTokensPerCore_;
+        if (endIdx > batchSize_) {
+            endIdx = batchSize_;
+        }
+        for (int64_t idx = startIdx; idx < endIdx; idx++) {
+            // set up LoRA index
+            CopyInIndex(idx);
+            if (reqLoRAIndex_ < 0) {
+                continue;
+            }
+            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
+
+            if (incremental_) {
+                ProcessImpl(idx);
+            } else {
+                ProcessImpl(idx);
+            }
+
+            ScaleOutput();
+            CopyOut(idx);
+        }
+    }
+
+private:
+    // Computes all maxLoRARank_ dot products for one token. In incremental
+    // mode X is re-streamed tile by tile per rank row; otherwise X is loaded
+    // and upcast once, then reused for every row.
+    template
+    __aicore__ inline void ProcessImpl(const int64_t idx)
+    {
+        AscendC::LocalTensor yOutLocal = outBufferY_.Get();
+        if constexpr (!INCREMENTAL_MODE) {
+            CopyInX(idx, 0, inputHiddenDim_);
+            AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get();
+            AscendC::LocalTensor xLocal = inQueueX_.DeQue();
+            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xLocal);
+        }
+
+        for (int i = 0; i < maxLoRARank_; i++) {
+            float acc(0);
+            for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
+                if constexpr (INCREMENTAL_MODE) {
+                    CopyInX(idx, j);
+                }
+                CopyInW(i, j);
+                Compute(acc);
+            }
+            CopyAndComputeLastIteration(idx, i, acc);
+            yOutLocal.SetValue(i, acc);
+        }
+    }
+
+    __aicore__ inline void CopyInIndex(const int64_t idx)
+    {
+        // look up the LoRA index
+        reqLoRAIndex_ = indicesGm_.GetValue(idx);
+    }
+
+    // Stages one tile of the token's input vector from global memory.
+    __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor xLocal = inQueueX_.AllocTensor();
+        DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
+        inQueueX_.EnQue(xLocal);
+    }
+
+    // Stages one tile of row `rowIdx` of the selected LoRA weight matrix.
+    __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor wLocal = inQueueW_.AllocTensor();
+        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
+        inQueueW_.EnQue(wLocal);
+    }
+
+    // Upcasts the staged tile(s) to fp32, multiplies elementwise and folds the
+    // tile's dot-product contribution into `acc` via ReduceSum.
+    template
+    __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
+    {
+        AscendC::LocalTensor wLocal = inQueueW_.DeQue();
+        AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get();
+        AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get();
+
+        if constexpr (INCREMENTAL_MODE) {
+            AscendC::LocalTensor xLocal = inQueueX_.DeQue();
+            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueX_.FreeTensor(xLocal);
+            inQueueW_.FreeTensor(wLocal);
+        } else {
+            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
+            pipe_barrier(PIPE_V);
+            inQueueW_.FreeTensor(wLocal);
+        }
+        // dot product of the one tile of X and W
+        Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+        // reduce sum generate one number, which is the summation of all the dot product
+        ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
+        pipe_barrier(PIPE_V);
+
+        acc += wTmpTensor.GetValue(0);
+    }
+
+    // Handles the hidden-dim remainder (inputHiddenDim_ % TILE_LENGTH).
+    template
+    __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
+    {
+        int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
+        int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
+        if (remaining == 0) {
+            return;
+        }
+        if constexpr (INCREMENTAL_MODE) {
+            CopyInX(idx, colIdx, remaining);
+        }
+        CopyInW(rowIdx, colIdx, remaining);
+        Compute(acc, remaining);
+    }
+
+    // Applies the LoRA scaling factor to the rank-sized accumulator vector.
+    __aicore__ inline void ScaleOutput()
+    {
+        AscendC::LocalTensor yLocal = outBufferY_.Get();
+        AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor();
+
+        Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
+        pipe_barrier(PIPE_V);
+
+        outQueueY_.EnQue(yOutLocal);
+    }
+
+    // Writes the token's rank-sized result back to global memory.
+    __aicore__ inline void CopyOut(const int64_t idx)
+    {
+        AscendC::LocalTensor yOutLocal = outQueueY_.DeQue();
+        DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
+        outQueueY_.FreeTensor(yOutLocal);
+    }
+
+private:
+    AscendC::TPipe *pipe_;
+    AscendC::TQue inQueueX_, inQueueW_;
+    AscendC::TQue outQueueY_;
+    AscendC::TBuf tmpBufferX_, tmpBufferW_, outBufferY_;
+    AscendC::GlobalTensor xGm_;
+    AscendC::GlobalTensor wGm_;
+    AscendC::GlobalTensor indicesGm_;
+    AscendC::GlobalTensor yOutGm_;
+    uint32_t batchSize_;
+    uint32_t numTokensPerCore_;
+    uint32_t inputHiddenDim_;
+    uint32_t maxLoRARank_;
+    float scale_;
+    uint32_t singleLoRAWeightLen_;
+    int64_t reqLoRAIndex_;
+    uint64_t reqLoRAWeightOffset_;
+    bool incremental_;
+};
+
+// Expands to the extern "C" kernel entry point for one scalar TYPE (mirrors
+// BGMV_EXPAND_TYPE_DECLARE in bgmv_expand.cpp).
+// NOTE(review): "BGMVShrink op(&pipe);" is missing its template argument list
+// here — presumably "<TYPE>", stripped from this diff; confirm upstream.
+#define BGMV_SHRINK_TYPE_DECLARE(TYPE) \
+    extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
+                                                             uint32_t indicesSize, __gm__ void* y, uint32_t batchSize, \
+                                                             uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
+                                                             uint32_t maxLoRARank, float scale) \
+    { \
+        AscendC::TPipe pipe; \
+        BGMVShrink op(&pipe); \
+        op.Init(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
+        op.Process(); \
+    }
+
+// declare all dtype kernel
+BGMV_SHRINK_TYPE_DECLARE(half)
+#if (__CCE_AICORE__ >= 220)
+    BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
+#endif
+
+namespace vllm_ascend {
+// Host-side dispatcher for the shrink kernel; same dtype/launch structure as
+// bgmv_expand_impl. NOTE(review): "<<>>" launch configs appear stripped here
+// as well (presumably "<<<blockDim, nullptr, stream>>>"); confirm upstream.
+extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
+                             void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
+                             uint32_t maxLoRARank, float scale)
+{
+    uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
+    if (type == AscendType::FP16) {
+        bgmv_shrink_half<<>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
+                             inputHiddenDim, maxLoRARank, scale);
+    } else if (type == AscendType::BF16) {
+        #if (__CCE_AICORE__ >= 220)
+        bgmv_shrink_bfloat16_t<<>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
+                                   inputHiddenDim, maxLoRARank, scale);
+        #endif
+    } else {
+        return;
+    }
+}
+
+} // namespace vllm_ascend
\ No newline at end of file
diff --git a/csrc/kernels/get_masked_input_and_mask_kernel.cpp b/csrc/kernels/get_masked_input_and_mask_kernel.cpp
new file mode 100644
index 0000000..25aeb60
--- /dev/null
+++ b/csrc/kernels/get_masked_input_and_mask_kernel.cpp
@@ -0,0 +1,378 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ */
+
+#include "kernel_operator.h"
+#include "kernel_tensor_impl.h"
+#include "kernel_type.h"
+#include "types.h"
+#include "utils.h"
+using vllm_ascend::AccType;
+
+template
+class GetMaskedInputAndMask {
+public:
+    __aicore__ inline GetMaskedInputAndMask() {}
+
+    __aicore__ inline ~GetMaskedInputAndMask() {
+        pipe.Reset();
+    }
+
+
+    // Binds global buffers and allocates all on-chip queues/buffers for a
+    // chunk of `size` elements (rounded up to a multiple of 32 below).
+    __aicore__ inline void Init(
+        __gm__ scalar_t* input,
+        __gm__ scalar_t* masked_input,
+        __gm__ bool* mask_out,
+        const int64_t org_vocab_start_index,
+        const int64_t org_vocab_end_index,
+        const int64_t num_org_vocab_padding,
+        const int64_t added_vocab_start_index,
+        const int64_t added_vocab_end_index,
+        const int64_t size)
+    {
+        // Initialize basic parameters
+        input_ = input;
+        masked_input_ = masked_input;
+        mask_out_ = mask_out;
+        org_vocab_start_index_ = org_vocab_start_index;
+        org_vocab_end_index_ = org_vocab_end_index;
+        size_ = ((size + 31) / 32) * 32;
+        added_offset_ = added_vocab_start_index -
+                        (org_vocab_end_index - org_vocab_start_index) -
+                        num_org_vocab_padding;
+        added_vocab_start_index_ = added_vocab_start_index;
+        added_vocab_end_index_ = added_vocab_end_index;
+
+        // Initialize global tensors
+        inputGlobal.SetGlobalBuffer(input);
+        maskedOutputGlobal.SetGlobalBuffer(masked_input);
+        maskOutGlobal.SetGlobalBuffer(mask_out);
+
+        // Initialize queues
+        pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t));
+        pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t));
+        pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool));
+
+        // Initialize 
calculation buffers
+        // NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs.
+        pipe.InitBuffer(calc_buf_1, size_ * sizeof(float));
+        pipe.InitBuffer(calc_buf_2, size_ * sizeof(float));
+
+        // Initialize result queues
+        pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float));
+        pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float));
+
+        // Initialize temporary buffers
+        pipe.InitBuffer(start_buf, size_ * sizeof(float));
+        pipe.InitBuffer(end_buf, size_ * sizeof(float));
+        pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting
+        pipe.InitBuffer(validOffset_buf, size_ * sizeof(float));
+        pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t));
+        pipe.InitBuffer(ones_buf_, size_ * sizeof(float));
+    }
+
+    // Standard three-stage AscendC pipeline: stage in, compute, stage out.
+    __aicore__ inline void Process()
+    {
+        CopyIn();
+        Compute();
+        CopyOut();
+    }
+
+private:
+    __aicore__ inline void CopyIn()
+    {
+        AscendC::LocalTensor inputLocal = inQueue.AllocTensor();
+        AscendC::DataCopy(inputLocal, inputGlobal, size_);
+        inQueue.EnQue(inputLocal);
+    }
+
+    // Emits an element-wise 0/1 indicator (int8, via a half intermediate) for
+    // input >= compare_value (is_greater_equal) or input < compare_value.
+    // The Mins/Muls/Adds/Abs chain collapses any nonzero difference to exactly
+    // 1.0 using the saturation constants below.
+    // NOTE(review): correctness depends on MIN_ACCURACY_FP32 * MAX_MUL_1_FP32^2
+    // * MAX_MUL_2_FP32 == 1.0 — confirm the constant values against upstream.
+    __aicore__ inline void CompareWithValue(
+        AscendC::LocalTensor& result,
+        const AscendC::LocalTensor& input,
+        const AscendC::LocalTensor& compare_value,
+        bool is_greater_equal) {
+
+        AscendC::LocalTensor compute_buf = calc_buf_1.Get();
+        if (is_greater_equal) {
+            AscendC::Max(compute_buf, input, compare_value, size_);
+            AscendC::Sub(compute_buf, compare_value, compute_buf, size_);
+        } else {
+            AscendC::Max(compute_buf, input, compare_value, size_);
+            AscendC::Sub(compute_buf, compute_buf, compare_value, size_);
+        }
+
+        AscendC::Abs(compute_buf, compute_buf, size_);
+        AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_);
+        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
+        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
+        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_);
+        AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_);
+        AscendC::Abs(compute_buf, compute_buf, size_);
+
+        AscendC::LocalTensor compute_buf_fp16 = calc_buf_2.Get();
+        AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_);
+    }
+
+    // Builds range_mask = (start_value <= input < end_value) as an int8 0/1
+    // tensor; on pre-220 arch, And is emulated through int16 via half casts.
+    __aicore__ inline void ComputeRangeMask(
+        AscendC::LocalTensor& range_mask,
+        const AscendC::LocalTensor& input,
+        const float start_value,
+        const float end_value) {
+
+        AscendC::LocalTensor start_value_tensor = start_buf.Get();
+        AscendC::LocalTensor end_value_tensor = end_buf.Get();
+
+        AscendC::Duplicate(start_value_tensor, start_value, size_);
+        AscendC::Duplicate(end_value_tensor, end_value, size_);
+
+        AscendC::LocalTensor ge_result = result_ge_que.AllocTensor();
+        AscendC::LocalTensor lt_result = result_le_que.AllocTensor();
+
+        CompareWithValue(ge_result, start_value_tensor, input, true);
+        CompareWithValue(lt_result, input, end_value_tensor, false);
+
+#if (__CCE_AICORE__ >= 220)
+        AscendC::And(range_mask, ge_result, lt_result, size_);
+#else
+        {
+            // WORKAROUND for older arch
+            // No direct int8->int16 cast. Use half as intermediate.
+            // No direct int8 And. Use int16 And.
+            AscendC::LocalTensor ge_result_i16 = calc_buf_1.Get();
+            AscendC::LocalTensor lt_result_i16 = calc_buf_2.Get();
+            AscendC::LocalTensor range_mask_i16 = ge_result_i16;
+
+            // Use a temporary buffer for half type
+            AscendC::LocalTensor tmp_half = inputFloat_buf.Get();
+
+            // 1. Cast inputs: int8_t -> half -> int16_t
+            AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            AscendC::Cast(tmp_half, lt_result, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(lt_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            // 2. Perform And on int16_t tensors
+            AscendC::And(range_mask_i16, ge_result_i16, lt_result_i16, size_);
+
+            // 3. Cast result back: int16_t -> half -> int8_t
+            AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+        }
+#endif
+    }
+
+    // Maps ids from the org/added vocab ranges into one contiguous index space
+    // (masked_input = (input - offset) * in_vocab) and emits mask = !in_vocab.
+    // NOTE(review): several LocalTensor declarations below (orgVocabMask_fp16,
+    // orgVocabMask_fp32, addedOffset, vocabMask_fp16/fp32, maskLocal_fp16/fp32)
+    // are not visibly bound to any TBuf before being written — the buffer
+    // binding text may have been lost from this diff; confirm upstream.
+    __aicore__ inline void Compute() {
+        AscendC::LocalTensor inputLocal = inQueue.DeQue();
+        AscendC::LocalTensor maskedLocal = outQueue.AllocTensor();
+        AscendC::LocalTensor maskLocal = maskQueue.AllocTensor();
+
+        AscendC::LocalTensor inputFloat = inputFloat_buf.Get();
+        AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_);
+
+        AscendC::LocalTensor orgVocabMask = result_org_mask_que.AllocTensor();
+        ComputeRangeMask(orgVocabMask,
+                         inputFloat,
+                         static_cast(org_vocab_start_index_),
+                         static_cast(org_vocab_end_index_));
+
+        AscendC::LocalTensor addedVocabMask = result_add_mask_que.AllocTensor();
+        ComputeRangeMask(addedVocabMask,
+                         inputFloat,
+                         static_cast(added_vocab_start_index_),
+                         static_cast(added_vocab_end_index_));
+
+        // validOffset = org_vocab_start_index where the id is in the org range,
+        // plus added_offset_ where it is in the added range (masks are disjoint).
+        AscendC::LocalTensor validOffset = validOffset_buf.Get();
+        AscendC::LocalTensor constOrgStartIndex = start_buf.Get();
+
+        AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_);
+
+        AscendC::LocalTensor orgVocabMask_fp16;
+        AscendC::LocalTensor orgVocabMask_fp32;
+        AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+
+        AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_);
+
+        AscendC::LocalTensor addedOffset;
+        AscendC::LocalTensor addedOffsetTensor = end_buf.Get();
+        AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_);
+
+        AscendC::LocalTensor addedVocabMask_fp16;
+        AscendC::LocalTensor addedVocabMask_fp32;
+        AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+
+        AscendC::Mul(addedOffset, addedOffsetTensor, addedVocabMask_fp32, size_);
+        AscendC::Add(validOffset, validOffset, addedOffset, size_);
+
+        // vocabMask = in org range OR in added range (i.e. a valid id).
+        AscendC::LocalTensor vocabMask = vocabMask_buf_.Get();
+
+#if (__CCE_AICORE__ >= 220)
+        AscendC::Or(vocabMask,
+                    orgVocabMask,
+                    addedVocabMask,
+                    size_);
+#else
+        {
+            // WORKAROUND for older arch
+            // No direct int8->int16 cast. Use half as intermediate.
+            // No direct int8 Or. Use int16 Or.
+            AscendC::LocalTensor orgVocabMask_i16 = calc_buf_1.Get();
+            AscendC::LocalTensor addedVocabMask_i16 = calc_buf_2.Get();
+            AscendC::LocalTensor vocabMask_i16 = orgVocabMask_i16;
+
+            // Use a temporary buffer for half type. inputFloat_buf is free now.
+            AscendC::LocalTensor tmp_half = inputFloat_buf.Get();
+
+            // 1. Cast inputs: int8_t -> half -> int16_t
+            AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+
+            // 2. Perform Or on int16_t tensors
+            AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_);
+
+            // 3. Cast result back: int16_t -> half -> int8_t
+            AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_);
+            AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
+        }
+#endif
+
+        AscendC::Sub(inputFloat, inputFloat, validOffset, size_);
+
+        AscendC::LocalTensor vocabMask_fp16;
+        AscendC::LocalTensor vocabMask_fp32;
+        AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
+
+        // Invalid ids are zeroed: masked = (input - validOffset) * vocabMask.
+        AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_);
+
+        AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_);
+        outQueue.EnQue(maskedLocal);
+
+        // mask_out = 1 - vocabMask (true exactly where the id is NOT in vocab).
+        AscendC::LocalTensor ones_tensor = ones_buf_.Get();
+        AscendC::Duplicate(ones_tensor, (float)1, size_);
+        AscendC::LocalTensor maskLocal_fp32;
+
+        AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_);
+
+        AscendC::LocalTensor maskLocal_fp16;
+        AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_);
+        AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_);
+        maskQueue.EnQue(maskLocal);
+        inQueue.FreeTensor(inputLocal);
+    }
+
+    __aicore__ inline void CopyOut()
+    {
+        AscendC::LocalTensor maskedLocal = outQueue.DeQue();
+        AscendC::LocalTensor maskLocal = maskQueue.DeQue();
+
+        AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_);
+        AscendC::DataCopy(maskOutGlobal, maskLocal, size_);
+
+        outQueue.FreeTensor(maskedLocal);
+        maskQueue.FreeTensor(maskLocal);
+    }
+
+private:
+    static constexpr int32_t BUFFER_NUM = 2;
+    AscendC::TPipe pipe;
+    AscendC::TQue inQueue;
+    AscendC::TQue outQueue, maskQueue;
+    AscendC::GlobalTensor inputGlobal, maskedOutputGlobal;
+    AscendC::GlobalTensor maskOutGlobal;
+    AscendC::TBuf calc_buf_1;
+    AscendC::TBuf calc_buf_2;
+    AscendC::TQue result_ge_que;
+    AscendC::TQue result_le_que;
+    AscendC::TQue result_org_mask_que;
+    AscendC::TQue result_add_mask_que;
+
+    // Temporary buffers
+    AscendC::TBuf start_buf;
+    AscendC::TBuf end_buf;
+    AscendC::TBuf inputFloat_buf;
+    AscendC::TBuf validOffset_buf;
+    AscendC::TBuf vocabMask_buf_;
+    AscendC::TBuf ones_buf_;
+
+    __gm__ scalar_t *input_, *masked_input_;
+    __gm__ bool *mask_out_;
+    int64_t size_;
+    int64_t org_vocab_start_index_, org_vocab_end_index_;
+    int64_t added_vocab_start_index_, added_vocab_end_index_;
+    int64_t added_offset_;
+
+    static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38;
+    static constexpr float MAX_MUL_1_FP32 = 1125899906842624;
+    static constexpr float MAX_MUL_2_FP32 = 67108864;
+    static constexpr float NEGATIVE_ONE_FP32 = -1.0f;
+};
+
+// Device entry: each AI-vector core strides over loop_cnt chunks of size/loop_cnt
+// elements, re-Init-ing and re-running the operator per chunk.
+// NOTE(review): "GetMaskedInputAndMask op{};" is missing its template argument
+// list (presumably "<int32_t>"), stripped from this diff; confirm upstream.
+extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel(
+    __gm__ int32_t* input,
+    __gm__ int32_t* masked_input,
+    __gm__ bool* mask_out,
+    const int64_t org_vocab_start_index,
+    const int64_t org_vocab_end_index,
+    const int64_t num_org_vocab_padding,
+    const int64_t added_vocab_start_index,
+    const int64_t added_vocab_end_index,
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
+{
+    {
+        GetMaskedInputAndMask op{};
+
+        for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) {
+            op.Init(input + i * size/loop_cnt,
+                    masked_input + i * size/loop_cnt,
+                    mask_out + i * size/loop_cnt,
+                    org_vocab_start_index, org_vocab_end_index,
+                    num_org_vocab_padding, added_vocab_start_index,
+                    added_vocab_end_index, size/loop_cnt);
+
+            op.Process();
+        }
+    } // op destructor called here
+}
+
+namespace vllm_ascend {
+
+// Host-side launcher. NOTE(review): the "<<>>" launch config (presumably
+// "<<<aiv_num, nullptr, stream>>>") and the static_cast target types appear
+// stripped from this diff; confirm upstream.
+void get_masked_input_and_mask_impl(
+    void* stream,
+    void* input,
+    void* masked_input,
+    void* mask_out,
+    const int64_t org_vocab_start_index,
+    const int64_t org_vocab_end_index,
+    const int64_t num_org_vocab_padding,
+    const int64_t added_vocab_start_index,
+    const int64_t added_vocab_end_index,
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
+{
+    get_masked_input_and_mask_kernel<<>>(
+        static_cast(input),
+        static_cast(masked_input),
+        static_cast(mask_out),
+        org_vocab_start_index,
+        org_vocab_end_index,
+        num_org_vocab_padding,
+        added_vocab_start_index,
+        added_vocab_end_index,
+        size,
+        loop_cnt,
+        aiv_num);
+}
+
+} // namespace vllm_ascend
diff --git a/csrc/kernels/pos_encoding_kernels.cpp b/csrc/kernels/pos_encoding_kernels.cpp
new file mode 100644
index 0000000..69a1519
--- /dev/null
+++ b/csrc/kernels/pos_encoding_kernels.cpp
@@ -0,0 +1,372 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "kernel_operator.h" +#include +#include "types.h" +#include "utils.h" + + +using vllm_ascend::AccType; +using vllm_ascend::local_mem_copy; +template class RotaryEmbedding { + // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to + // retrieve this size from runtime for more Soc support + #if (__CCE_AICORE__ >= 220) + static int constexpr loadSize = 512; + #else + static int constexpr loadSize = 1024 * 4; + #endif + using dst_t = scalar_t; + using acc_t = typename AccType::type; + // only half tensor have cast instruct to int8, hardcode acc_dst_t as half + using local_scalar_t = AscendC::LocalTensor; + using local_acc_t = AscendC::LocalTensor; + using local_dst_t = AscendC::LocalTensor; + +public: + __aicore__ inline RotaryEmbedding() + { + } + + // Allocate buffers for input and output queue and the temp buffer used during kernel compute process, + // this init process happens only in the kernel compute on a single vector core. + __aicore__ inline void init(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst, + __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache, + const int rotDim, const int64_t dstQueryStride, + const int64_t dstKeyStride, const int64_t queryStride, const int64_t keyStride, + const int numHeads, const int numKvHeads, const int headSize, AscendC::TPipe *pipe) + { + pipe_ = pipe; + rotDim_ = rotDim; + // query stride and key stride is used to handle the strided tensor which is not contiguous on num_tokens dim + queryStride_ = queryStride; + keyStride_ = keyStride; + dstQueryStride_ = dstQueryStride; + dstKeyStride_ = dstKeyStride; + numHeads_ = numHeads; + numKvHeads_ = numKvHeads; + headSize_ = headSize; + embedDim_ = rotDim / 2; + + pipe_->InitBuffer(inQue_, 1 /* buffer_num */, loadSize /* buffer_size */); + pipe_->InitBuffer(inQueSinCos_, 1 /* buffer_num */, rotDim_ * sizeof(scalar_t) /* buffer_size */); + pipe_->InitBuffer(outQue_, 1 /* buffer_num */, loadSize 
/* buffer_size */); + // 2 temporary calculation buffer + calcTmpBufferOffset_ = 0; + // 1 upcast buffer for bf16 (headSize) + upcastInputBufferOffset_ = calcTmpBufferOffset_ + sizeof(acc_t) * embedDim_ * 2; + // 1 upcast temp buffer for bf16 (2 * embed_dim) + upcastTempBufferOffset_ = upcastInputBufferOffset_ + sizeof(acc_t) * headSize_; + // 2 sin cos upcast buffer for bf16 + cosSinUpcastBufferOffset_ = upcastTempBufferOffset_ + sizeof(acc_t) * 2 * embedDim_; + // 2. bf16 path: needs 2 cos sin upcast buffer size + // 3. fp16 path: needs 2 temporary calculation buffer size + tempBufferSize_ = cosSinUpcastBufferOffset_ + 2 * embedDim_ * sizeof(acc_t); + // need to consider upcast the bf16 to fp32, so we might need 4 buffer just in case + // 2 temporary buffer, 2 input buffer, 1 cos buffer, 1 sin buffer, 2 scale buffer (headSize), 2 zp + // buffer(headSize int8), 1 dst_temp buffer(headSize, int32) + pipe_->InitBuffer(calcBuf_, tempBufferSize_ /* buffer_size */); + if constexpr (!std::is_same_v) { + pipe_->InitBuffer(copyBuf_, loadSize); + } + } + __aicore__ inline void update_mem_offset(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst, + __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache, + const int rotDim, const int64_t dstQueryStride, const int64_t dstKeyStride, + const int64_t queryStride, const int64_t keyStride, const int numHeads, + const int numKvHeads, const int headSize, const int64_t idx) + { + int64_t pos = positions[idx]; + cosSin_.SetGlobalBuffer(cosSinCache + pos * rotDim_, rotDim_); + query_.SetGlobalBuffer(query + queryStride * idx, headSize * numHeads_); + key_.SetGlobalBuffer(key + keyStride * idx, headSize * numKvHeads_); + queryDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(queryDst) + dstQueryStride * idx, + headSize * numHeads_); + keyDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(keyDst) + dstKeyStride * idx, headSize * numKvHeads_); + } + + // compute per head for neox on bf16 + 
template , void>::type * = nullptr> + __aicore__ inline void + neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor sin, AscendC::LocalTensor cos, + AscendC::LocalTensor upcastInputBuffer, AscendC::LocalTensor calcTmpBuffer) + { + // slice dst + local_dst_t dstX = dst; + local_dst_t dstY = dst[embedDim_]; + + // slice src + local_scalar_t srcX = src; + local_scalar_t srcY = src[embedDim_]; + + // slice temp buffer + local_acc_t calcTmpBufferX = calcTmpBuffer; + local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_]; + + // slice upcast input buffer + local_acc_t upcastBufferX = upcastInputBuffer; + local_acc_t upcastBufferY = upcastBufferX[embedDim_]; + + // dst x calc + Cast(upcastInputBuffer, src, AscendC::RoundMode::CAST_NONE, headSize_); + Mul(calcTmpBufferX, upcastBufferX, cos, embedDim_); + Mul(calcTmpBufferY, upcastBufferY, sin, embedDim_); + Sub(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_); + Cast(dstX, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_); + + // dst y calc + Mul(calcTmpBufferX, upcastBufferX, sin, embedDim_); + Mul(calcTmpBufferY, upcastBufferY, cos, embedDim_); + Add(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_); + Cast(dstY, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_); + } + + // compute per head output for neox + template , void>::type * = nullptr> + __aicore__ inline void + neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor sin, AscendC::LocalTensor cos, + AscendC::LocalTensor upcastInputBuffer, AscendC::LocalTensor calcTmpBuffer) + { + // slice dst buffer + local_dst_t dstX = dst; + local_dst_t dstY = dst[embedDim_]; + // slice src buffer + local_scalar_t srcX = src; + local_scalar_t srcY = src[embedDim_]; + // slice temp buffer + local_acc_t calcTmpBufferX = calcTmpBuffer; + local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_]; + + // dst x calc + Mul(calcTmpBufferX, srcX, cos, embedDim_); + Mul(calcTmpBufferY, srcY, sin, embedDim_); + Sub(dstX, 
calcTmpBufferX, calcTmpBufferY, embedDim_); + + // dst y calc + Mul(calcTmpBufferX, srcX, sin, embedDim_); + Mul(calcTmpBufferY, srcY, cos, embedDim_); + Add(dstY, calcTmpBufferX, calcTmpBufferY, embedDim_); + } + + __aicore__ inline void compute_qk(AscendC::GlobalTensor srcG, AscendC::GlobalTensor dstG, + local_acc_t localCos, local_acc_t localSin, local_acc_t upcastInputBuffer, + local_acc_t calcTmpBuffer, int loopCnt, int tailHeads, int loadStride, + int headNumPerLoad) + { + for (int loopNum = 0; loopNum < loopCnt; ++loopNum) { + local_scalar_t src = inQue_.AllocTensor(); + local_dst_t dst = outQue_.AllocTensor(); + AscendC::DataCopy(src, srcG[loopNum * loadStride], loadStride); + inQue_.EnQue(src); + + local_scalar_t srcDeque = inQue_.DeQue(); + if constexpr (!std::is_same_v) { + int elem_num = loadStride / sizeof(scalar_t); + AscendC::LocalTensor upBuffer = copyBuf_.GetWithOffset(elem_num, 0); + Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num); + Cast(dst, upBuffer, AscendC::RoundMode::CAST_TRUNC, elem_num); + } else { + local_mem_copy(dst, srcDeque, loadStride); + } + for (int i = 0; i < headNumPerLoad; ++i) { + neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer, + calcTmpBuffer); + } + outQue_.EnQue(dst); + local_dst_t dstDeque = outQue_.DeQue(); + AscendC::DataCopy(dstG[loopNum * loadStride], dstDeque, loadStride); + outQue_.FreeTensor(dstDeque); + inQue_.FreeTensor(srcDeque); + } + // process tail + { + local_scalar_t src = inQue_.AllocTensor(); + local_dst_t dst = outQue_.AllocTensor(); + + AscendC::DataCopy(src, srcG[loopCnt * loadStride], tailHeads * headSize_); + inQue_.EnQue(src); + local_scalar_t srcDeque = inQue_.DeQue(); + + if constexpr (!std::is_same_v) { + int elem_num = tailHeads * headSize_ / sizeof(scalar_t); + AscendC::LocalTensor upBuffer = copyBuf_.GetWithOffset(elem_num, 0); + Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num); + Cast(dst, upBuffer, 
AscendC::RoundMode::CAST_TRUNC, elem_num); + } else { + local_mem_copy(dst, srcDeque, tailHeads * headSize_); + } + + for (int i = 0; i < tailHeads; ++i) { + neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer, + calcTmpBuffer); + } + outQue_.EnQue(dst); + local_dst_t dstDeque = outQue_.DeQue(); + AscendC::DataCopy(dstG[loopCnt * loadStride], dstDeque, tailHeads * headSize_); + outQue_.FreeTensor(dstDeque); + inQue_.FreeTensor(srcDeque); + } + } + + __aicore__ inline void compute_function() + { + local_scalar_t cosSinLocal = inQueSinCos_.AllocTensor(); + + AscendC::DataCopy(cosSinLocal, cosSin_, embedDim_ * 2); + + inQueSinCos_.EnQue(cosSinLocal); + local_scalar_t localSinCosDeque = inQueSinCos_.DeQue(); + local_scalar_t localCos = localSinCosDeque; + local_scalar_t localSin = localSinCosDeque[embedDim_]; + + local_acc_t calcTmpBuffer; + local_acc_t upcastInputBuffer; + local_acc_t upcastTempBuffer; + local_acc_t cosSinUpcastBuffer; + local_acc_t scaleBuffer; + local_acc_t offsetBuffer; + calcTmpBuffer = calcBuf_.GetWithOffset(embedDim_ * 2, calcTmpBufferOffset_); + upcastInputBuffer = calcBuf_.GetWithOffset(headSize_, upcastInputBufferOffset_); + upcastTempBuffer = calcBuf_.GetWithOffset(embedDim_ * 2, upcastTempBufferOffset_); + cosSinUpcastBuffer = calcBuf_.GetWithOffset(embedDim_ * 2, cosSinUpcastBufferOffset_); + + local_acc_t cosAccBuffer; + local_acc_t sinAccBuffer; + + if constexpr (!std::is_same_v) { + Cast(cosSinUpcastBuffer, localSinCosDeque, AscendC::RoundMode::CAST_NONE, 2 * embedDim_); + cosAccBuffer = cosSinUpcastBuffer; + sinAccBuffer = cosSinUpcastBuffer[embedDim_]; + } else { + cosAccBuffer = localCos; + sinAccBuffer = localSin; + } + + constexpr const int loadSizeByElem = loadSize / sizeof(scalar_t); + int64_t headNumPerLoad = loadSizeByElem / headSize_; + int64_t loopCnt = numHeads_ / headNumPerLoad; + int64_t tailHeads = numHeads_ - loopCnt * headNumPerLoad; + int64_t loadStride = headNumPerLoad * 
headSize_; + int64_t loopCntKv = numKvHeads_ / headNumPerLoad; + int64_t tailHeadsKv = numKvHeads_ - loopCntKv * headNumPerLoad; + compute_qk(query_, queryDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer, + calcTmpBuffer, loopCnt, tailHeads, loadStride, headNumPerLoad); + + compute_qk(key_, keyDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer, calcTmpBuffer, + loopCntKv, tailHeadsKv, loadStride, headNumPerLoad); + + inQueSinCos_.FreeTensor(localSinCosDeque); + } + +private: + AscendC::TPipe *pipe_; + AscendC::TQue inQue_, inQueSinCos_; + AscendC::TQue outQue_; + AscendC::TBuf calcBuf_; + AscendC::TBuf copyBuf_; + AscendC::GlobalTensor queryDst_; + AscendC::GlobalTensor keyDst_; + AscendC::GlobalTensor query_; + AscendC::GlobalTensor key_; + AscendC::GlobalTensor cosSin_; + int rotDim_; + int embedDim_; + int64_t queryStride_; + int64_t keyStride_; + int64_t dstQueryStride_; + int64_t dstKeyStride_; + int numHeads_; + int numKvHeads_; + int headSize_; + int calcTmpBufferOffset_; + int upcastInputBufferOffset_; + int upcastTempBufferOffset_; + int cosSinUpcastBufferOffset_; + int tempBufferSize_; +}; + +// Note: Need to use macro to instaniate all the target functions here, for the current build system dose not support template call in cpp +// We use C style symbol here for kernel compilation, cpp style kernel entry may lead to compilation failure +#define ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, NEOX) \ + extern "C" __global__ __aicore__ void rope_custom_##NEOX##_##TYPE( \ + __gm__ int64_t* positions, __gm__ void* queryDst, __gm__ void* keyDst, __gm__ TYPE* query, __gm__ TYPE* key, \ + __gm__ TYPE* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, \ + const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, \ + const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) \ + { \ + AscendC::TPipe pipe; \ + RotaryEmbedding op{}; \ + op.init(positions, queryDst, keyDst, 
query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride, \ + queryStride, keyStride, numHeads, numKvHeads, headSize, &pipe); \ + for (int64_t i = AscendC::GetBlockIdx(); i < numTokens; i += coreNum) { \ + op.update_mem_offset(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride, \ + queryStride, keyStride, numHeads, numKvHeads, headSize, i); \ + op.compute_function(); \ + } \ + } + +#define ROPE_CUSTOM_KERNEL_DECLARE(TYPE) \ + ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, true); \ + ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, false); + +// Declare all the kernel entry here +ROPE_CUSTOM_KERNEL_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { + +#define ROTARY_EMBEDDING_KERNEL_CALL(TYPE) \ + if (isNeox) \ + rope_custom_true_##TYPE<<>>( \ + positions, queryDst, keyDst, reinterpret_cast(query), reinterpret_cast(key), \ + reinterpret_cast(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \ + numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); \ + else \ + rope_custom_false_##TYPE<<>>( \ + positions, queryDst, keyDst, reinterpret_cast(query), reinterpret_cast(key), \ + reinterpret_cast(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \ + numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); + +// maximum number for runtime to launch a ascendc kernel. 
+// we use this to constrain the maximum number of block size +static const int64_t maxParallelSize = 65535; + +extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst, + void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim, + const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, + const int64_t dstKeyStride, const int numHeads, const int numKvHeads, + const int headSize, const int64_t numTokens, const uint32_t loopCnt, + uint32_t aivNum) +{ + + int blockDim = maxParallelSize > numTokens ? numTokens : maxParallelSize; + if (type == AscendType::FP16) { + ROTARY_EMBEDDING_KERNEL_CALL(half); + } + #if (__CCE_AICORE__ >= 220) + else if (type == AscendType::BF16) { + ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t); + } + #endif + else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/sgmv_expand.cpp b/csrc/kernels/sgmv_expand.cpp new file mode 100644 index 0000000..5466bd6 --- /dev/null +++ b/csrc/kernels/sgmv_expand.cpp @@ -0,0 +1,389 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel_operator.h" +#include "types.h" + +template +class SGMVExpand { +public: + using X_T = float; + using W_T = scalar_t; + using Y_T = scalar_t; + + static constexpr uint64_t LORA_RANK_8 = 8; + static constexpr uint64_t LORA_RANK_16 = 16; + static constexpr uint64_t LORA_RANK_32 = 32; + static constexpr uint64_t LORA_RANK_64 = 64; + static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64}; + static constexpr int32_t BUFFER_NUM = 2; + + // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time. + static constexpr int32_t NUM_BYTES_PER_REPEAT = 256; + static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8; + // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type). + static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Mask is used to control the elements that participate in computation in each iteration. + static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float); + // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants. + static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192; + static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096; + static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT; + // BlockReduceSum would generate(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT)floats. + // So need to read them all and apply PairReduceSum + static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 = + (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16. + // Same for rank=64, we do not support ranks greater than 64. 
+ static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2; + +public: + __aicore__ inline SGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {} + + __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* loraIndices, uint32_t loraIndicesSize, + __gm__ void* seqLen, uint32_t seqLenSize, __gm__ void* yIn, __gm__ void* yOut, + uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + maxLoRARank_ = maxLoRARank; + outputHiddenDim_ = outputHiddenDim; + sliceOffset_ = sliceOffset; + outputFullDim_ = outputFullDim; + singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut); + loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize); + seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize); + + pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T)); + pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T)); + + pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float)); + + // Each compute iteration would generate not one, but several output elements. + // Therefore, the following variable would determine how many output elements are calculated in each iteration. 
+ numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_); + numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_; + + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + yOffset_ = outputFullDim_ * idx + sliceOffset_; + + // Set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + CopyInX(idx); + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + for (int32_t i = 0; i < numStreamOut; i++) { + CopyInY(i); + for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) { + CopyInW(i * numStreamInPerOutputTile_ + j); + Compute(j * numOutputElementsPerInputTile_); + } + ScaleOutput(); + CopyOut(i); + } + ComputeLastIteration(); + } + } + +private: + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // Look up the LoRA index + int64_t weightIdx = idx; + uint64_t i = 0; + for (; i < seqLenGm_.GetSize(); i++) { + int64_t repeatValue = seqLenGm_.GetValue(i); + if (weightIdx >= repeatValue) { + weightIdx -= repeatValue; + continue; + } + break; + } + reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? 
loraIndicesGm_.GetValue(i) : -1; + } + + __aicore__ inline void ComputeLastIteration() + { + int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS; + if (remainingY == 0) { + return; + } + int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS; + int32_t remainingW = remainingY * maxLoRARank_; + int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS; + int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS; + + CopyInY(numStreamOut, remainingY); + + int32_t outputIdx = 0; + for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx); + Compute(outputIdx * numOutputElementsPerInputTile_); + } + + if (remainingWForLastRepeat != 0) { + CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration, + remainingWForLastRepeat); + int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat16 = + (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT; + int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2; + int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_; + Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32); + } + + ScaleOutput(remainingY); + CopyOut(numStreamOut, remainingY); + } + + __aicore__ inline void CopyInX(const int64_t idx) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + if constexpr (std::is_same_v) { + DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_); + } else { + uint16_t blockLen = static_cast(maxLoRARank_ * sizeof(X_T)); + DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {}); + } + inQueueX_.EnQue(xLocal); + xLocal = inQueueX_.DeQue(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + + // As we are generating multiple output elements with one API invocation, + // we need to duplicate 
the X vector multiple times to fill one NUM_BYTES_PER_REPEAT + if constexpr (std::is_same_v) { + for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xLocal.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } else { + Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_); + pipe_barrier(PIPE_V); + + for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) { + for (int32_t j = 0; j < maxLoRARank_; j++) { + float entry = xDup.GetValue(j); + xDup.SetValue(i + j, entry); + } + } + } + inQueueX_.FreeTensor(xLocal); + } + + __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yInLocal = inQueueY_.AllocTensor(); + DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements); + inQueueY_.EnQue(yInLocal); + } + + __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements); + inQueueW_.EnQue(wLocal); + } + + __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor yInLocal = inQueueY_.DeQue(); + AscendC::LocalTensor yInLocalFP32 = inBufferY_.Get(); + Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueY_.FreeTensor(yInLocal); + + Add(yLocal, yLocal, yInLocalFP32, numElements); + pipe_barrier(PIPE_V); + + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void Compute(int32_t progress, + int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS, + int32_t 
pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16, + int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32) + { + AscendC::LocalTensor yLocal = tmpBufferY_.Get(); + AscendC::LocalTensor xDup = dupBufferX_.Get(); + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + + Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_); + pipe_barrier(PIPE_V); + + if (maxLoRARank_ == LORA_RANK_8) { + BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_16) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_32) { + BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } else if (maxLoRARank_ == LORA_RANK_64) { + BlockReduceSum(wTmpTensor, wTmpTensor, 
blockReduceRepeatCount, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT, + reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride); + pipe_barrier(PIPE_V); + } + } + + __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe* pipe_; + AscendC::TQue inQueueY_, inQueueW_; + AscendC::TQue inQueueX_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor yInGm_; + AscendC::GlobalTensor yOutGm_; + AscendC::GlobalTensor loraIndicesGm_; + AscendC::GlobalTensor seqLenGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t maxLoRARank_; + uint32_t outputHiddenDim_; + uint32_t sliceOffset_; + uint32_t outputFullDim_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + uint32_t numOutputElementsPerInputTile_; + uint32_t numStreamInPerOutputTile_; + uint64_t yOffset_; + + // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously. + // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat, + // reads next 8 consecutive blocks in the second repeat. 
+ AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4}; + + // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block, + // so we set dstRepStride = 1 + AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8}; + + // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks. + // For xDup we repeatedly use it, so we set src0RepStride = 0 + AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8}; + +}; + +#define SGMV_EXPAND_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void sgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, \ + __gm__ void* loraIndices, uint32_t loraIndicesSize, \ + __gm__ void* seqLen, uint32_t seqLenSize, \ + __gm__ void* yIn, __gm__ void* yOut, \ + uint32_t batchSize, uint32_t numTokensPerCore, \ + uint32_t maxLoRARank, uint32_t outputHiddenDim, \ + uint32_t sliceOffset, uint32_t outputFullDim) \ + { \ + AscendC::TPipe pipe; \ + SGMVExpand op(&pipe); \ + op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \ + yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \ + outputHiddenDim, sliceOffset, outputFullDim); \ + op.Process(); \ + } + +// declare all dtype kernel +SGMV_EXPAND_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + SGMV_EXPAND_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, + void* loraIndices, uint32_t loraIndicesSize, + void* seqLen, uint32_t seqLenSize, + void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank, + uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + sgmv_expand_half<<>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, + yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset, + 
outputFullDim); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + sgmv_expand_bfloat16_t<<>>(x, weight, loraIndices, loraIndicesSize, + seqLen, seqLenSize, yIn, yOut, batchSize, + numTokensPerCore, maxLoRARank, outputHiddenDim, + sliceOffset, outputFullDim); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/sgmv_shrink.cpp b/csrc/kernels/sgmv_shrink.cpp new file mode 100644 index 0000000..a72e592 --- /dev/null +++ b/csrc/kernels/sgmv_shrink.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kernel_operator.h" +#include "types.h" + +template +class SGMVShrink { +public: + using X_T = scalar_t; + using W_T = scalar_t; + using Y_T = float; + + static constexpr uint64_t BUFFER_NUM = 1; + static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length + +public: + __aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {} + __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, uint32_t loraIndicesSize, + __gm__ void *seqLen, uint32_t seqLenSize, + __gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) + { + batchSize_ = batchSize; + numTokensPerCore_ = numTokensPerCore; + inputHiddenDim_ = inputHiddenDim; + maxLoRARank_ = maxLoRARank; + scale_ = scale; + singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_; + incremental_ = inputHiddenDim_ > TILE_LENGTH; + + xGm_.SetGlobalBuffer((__gm__ X_T *)x); + yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y); + wGm_.SetGlobalBuffer((__gm__ W_T *)weight); + loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize); + seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize); + + pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T)); + pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T)); + pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float)); + pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float)); + + pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T)); + pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float)); + } + + __aicore__ inline void Process() + { + int64_t blockIdx = AscendC::GetBlockIdx(); + int64_t startIdx = blockIdx * numTokensPerCore_; + int64_t endIdx = startIdx + numTokensPerCore_; + if (endIdx > batchSize_) { + endIdx = batchSize_; + } + for (int64_t idx = startIdx; idx < endIdx; idx++) { + // set up LoRA index + CopyInIndex(idx); + if (reqLoRAIndex_ < 0) { + continue; + } + 
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_; + + if (incremental_) { + ProcessImpl(idx); + } else { + ProcessImpl(idx); + } + + ScaleOutput(); + CopyOut(idx); + } + } + +private: + template + __aicore__ inline void ProcessImpl(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outBufferY_.Get(); + if constexpr (!INCREMENTAL_MODE) { + CopyInX(idx, 0, inputHiddenDim_); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + } + + for (int i = 0; i < maxLoRARank_; i++) { + float acc(0); + for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) { + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, j); + } + CopyInW(i, j); + Compute(acc); + } + CopyAndComputeLastIteration(idx, i, acc); + yOutLocal.SetValue(i, acc); + } + } + + __aicore__ inline void CopyInIndex(const int64_t idx) + { + // look up the LoRA index + int64_t weightIdx = idx; + uint64_t i = 0; + for (; i < seqLenGm_.GetSize(); i++) { + int64_t repeatValue = seqLenGm_.GetValue(i); + if (weightIdx >= repeatValue) { + weightIdx -= repeatValue; + continue; + } + break; + } + reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? 
loraIndicesGm_.GetValue(i) : -1; + } + + __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor xLocal = inQueueX_.AllocTensor(); + DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements); + inQueueX_.EnQue(xLocal); + } + + __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.AllocTensor(); + DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements); + inQueueW_.EnQue(wLocal); + } + + template + __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH) + { + AscendC::LocalTensor wLocal = inQueueW_.DeQue(); + AscendC::LocalTensor xTmpTensor = tmpBufferX_.Get(); + AscendC::LocalTensor wTmpTensor = tmpBufferW_.Get(); + + if constexpr (INCREMENTAL_MODE) { + AscendC::LocalTensor xLocal = inQueueX_.DeQue(); + Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements); + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueX_.FreeTensor(xLocal); + inQueueW_.FreeTensor(wLocal); + } else { + Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements); + pipe_barrier(PIPE_V); + inQueueW_.FreeTensor(wLocal); + } + // dot product of the one tile of X and W + Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + // reduce sum generate one number, which is the summation of all the dot product + ReduceSum(wTmpTensor, wTmpTensor, wTmpTensor, numElements); + pipe_barrier(PIPE_V); + + acc += wTmpTensor.GetValue(0); + } + + template + __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc) + { + int32_t colIdx = inputHiddenDim_ / TILE_LENGTH; + int32_t remaining = inputHiddenDim_ % TILE_LENGTH; + if (remaining == 0) { + return; + } + if constexpr (INCREMENTAL_MODE) { + CopyInX(idx, colIdx, 
remaining); + } + CopyInW(rowIdx, colIdx, remaining); + Compute(acc, remaining); + } + + __aicore__ inline void ScaleOutput() + { + AscendC::LocalTensor yLocal = outBufferY_.Get(); + AscendC::LocalTensor yOutLocal = outQueueY_.AllocTensor(); + + Muls(yOutLocal, yLocal, scale_, maxLoRARank_); + pipe_barrier(PIPE_V); + + outQueueY_.EnQue(yOutLocal); + } + + __aicore__ inline void CopyOut(const int64_t idx) + { + AscendC::LocalTensor yOutLocal = outQueueY_.DeQue(); + DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_); + outQueueY_.FreeTensor(yOutLocal); + } + +private: + AscendC::TPipe *pipe_; + AscendC::TQue inQueueX_, inQueueW_; + AscendC::TQue outQueueY_; + AscendC::TBuf tmpBufferX_, tmpBufferW_, outBufferY_; + AscendC::GlobalTensor xGm_; + AscendC::GlobalTensor wGm_; + AscendC::GlobalTensor loraIndicesGm_; + AscendC::GlobalTensor seqLenGm_; + AscendC::GlobalTensor yOutGm_; + uint32_t batchSize_; + uint32_t numTokensPerCore_; + uint32_t inputHiddenDim_; + uint32_t maxLoRARank_; + float scale_; + uint32_t singleLoRAWeightLen_; + int64_t reqLoRAIndex_; + uint64_t reqLoRAWeightOffset_; + bool incremental_; +}; + +#define SGMV_SHRINK_TYPE_DECLARE(TYPE) \ + extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \ + __gm__ void* loraIndices, uint32_t loraIndicesSize, \ + __gm__ void* seqLen, uint32_t seqLenSize, \ + __gm__ void* y, uint32_t batchSize, \ + uint32_t numTokensPerCore, uint32_t inputHiddenDim, \ + uint32_t maxLoRARank, float scale) \ + { \ + AscendC::TPipe pipe; \ + SGMVShrink op(&pipe); \ + op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \ + y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \ + op.Process(); \ + } + +// declare all dtype kernel +SGMV_SHRINK_TYPE_DECLARE(half) +#if (__CCE_AICORE__ >= 220) + SGMV_SHRINK_TYPE_DECLARE(bfloat16_t) +#endif + +namespace vllm_ascend { +extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, + void* 
loraIndices, uint32_t loraIndicesSize, + void* seqLen, uint32_t seqLenSize, + void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim, + uint32_t maxLoRARank, float scale) +{ + uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore; + if (type == AscendType::FP16) { + sgmv_shrink_half<<>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, + y, batchSize, + numTokensPerCore, inputHiddenDim, maxLoRARank, + scale); + } else if (type == AscendType::BF16) { + #if (__CCE_AICORE__ >= 220) + sgmv_shrink_bfloat16_t<<>>(x, weight, loraIndices, loraIndicesSize, + seqLen, seqLenSize, + y, batchSize, + numTokensPerCore, inputHiddenDim, maxLoRARank, + scale); + #endif + } else { + return; + } +} + +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/kernels/types.h b/csrc/kernels/types.h new file mode 100644 index 0000000..7072e8c --- /dev/null +++ b/csrc/kernels/types.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace vllm_ascend { +enum struct AscendType { + FP16 = 0, + BF16 = 1, + FP32 = 2, +}; +} \ No newline at end of file diff --git a/csrc/kernels/utils.h b/csrc/kernels/utils.h new file mode 100644 index 0000000..c2d4261 --- /dev/null +++ b/csrc/kernels/utils.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include "kernel_type.h" +namespace vllm_ascend { + +template struct AccType; + +#if (__CCE_AICORE__ >= 220) +template <> struct AccType { + using type = float; +}; +#endif + +template <> struct AccType { + using type = half; +}; + +template <> struct AccType { + using type = float; +}; + +template <> struct AccType { + using type = int; +}; + +template +__aicore__ inline void local_mem_copy(AscendC::LocalTensor dst, AscendC::LocalTensor src, int size) +{ + constexpr int loadSize = 256 / sizeof(scalar_t); + int loopCnt = size / loadSize; + int tailSize = size % loadSize; + if (loopCnt) + AscendC::Copy(dst, src, loadSize, loopCnt, {1, 1, 8, 8}); + AscendC::Copy(dst[loopCnt * loadSize], src[loopCnt * loadSize], tailSize, 1, {1, 1, 8, 8}); +} +} // namespace vllm_ascend \ No newline at end of file diff --git a/csrc/ops.h b/csrc/ops.h new file mode 100644 index 0000000..4773992 --- /dev/null +++ b/csrc/ops.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include "kernels/types.h" +#include "torch_npu/csrc/aten/common/from_blob.h" + +namespace vllm_ascend { + extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst, + void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim, + const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride, + const int64_t dstKeyStride, const int numHeads, const int numKvHeads, + const int headSize, const int64_t numTokens, const uint32_t loopCnt, + uint32_t aivNum); + + extern void get_masked_input_and_mask_impl( + void* stream, + void* input, + void* masked_input, + void* mask_out, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index, + const int64_t size, + const uint32_t loop_cnt, + const uint32_t aiv_num); + + torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + if (!tensor.is_privateuseone()) { + throw std::runtime_error("Tensor must be on NPU device"); + } + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + // Get tensor sizes and strides + std::vector sizes = tensor.sizes().vec(); + std::vector strides = tensor.strides().vec(); + // Get tensor options (dtype, device) + auto options = tensor.options(); + // Create a new tensor from the raw data pointer + auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options); + return new_tensor; + 
} + + extern void bgmv_shrink_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *indices, + uint32_t indicesSize, + void *y, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t input_hidden_dim, + uint32_t lora_rank, + float scale); + + extern void bgmv_expand_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *indices, + uint32_t indicesSize, + void *y, + void *y_out, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t lora_rank, + uint32_t output_hidden_dim, + uint32_t slice_offset, + uint32_t output_full_dim); + + extern void sgmv_shrink_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *loraIndices, + uint32_t loraIndicesSize, + void *seqLen, + uint32_t seqLenSize, + void *y, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t input_hidden_dim, + uint32_t lora_rank, + float scale); + + extern void sgmv_expand_impl( + AscendType type, + void *stream, + void *x, + void *weight, + void *loraIndices, + uint32_t loraIndicesSize, + void *seqLen, + uint32_t seqLenSize, + void *y, + void *y_out, + uint32_t batch_size, + uint32_t num_tokens_per_core, + uint32_t lora_rank, + uint32_t output_hidden_dim, + uint32_t slice_offset, + uint32_t output_full_dim); +} diff --git a/csrc/torch_binding.cpp b/csrc/torch_binding.cpp new file mode 100644 index 0000000..375ef59 --- /dev/null +++ b/csrc/torch_binding.cpp @@ -0,0 +1,428 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "ops.h" +#include "utils.h" + +namespace vllm_ascend { + +AscendType get_dtype_from_torch(at::ScalarType scalarType) +{ + if (scalarType == at::ScalarType::Float) { + return AscendType::FP32; + } else if (scalarType == at::ScalarType::BFloat16) { + return AscendType::BF16; + } else { + return AscendType::FP16; + } +} + +std::tuple rotary_embedding(at::Tensor &positions, at::Tensor &query, at::Tensor &key, + int64_t head_size, at::Tensor &cos_sin_cache, bool is_neox) +{ + int32_t deviceId = 0; + int64_t num_tokens = positions.numel(); + int positions_ndim = positions.dim(); + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK( + query.size(0) == positions.size(0) && key.size(0) == positions.size(0), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + key.size(0) == positions.size(0) && + query.size(1) == positions.size(1) && + key.size(1) == positions.size(1), + "query, key and positions must have the same batch_size and seq_len"); + } + TORCH_CHECK(head_size % 32 == 0, "rotary_embedding: headSize should be divisible by 32"); + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.numel() / num_tokens; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + TORCH_CHECK(is_neox == true, "rotary_embedding: neox=false is not supported as custom kernel in vllm-ascend"); + + // Make sure query and key have consistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key_hidden_size / head_size; + TORCH_CHECK(num_heads % 
num_kv_heads == 0); + at::Tensor query_dst = at::empty({num_tokens, num_heads, head_size}, query.options()); + at::Tensor key_dst = at::empty({num_tokens, num_kv_heads, head_size}, key.options()); + + int rot_dim = cos_sin_cache.size(1); + int seq_dim_idx = positions_ndim - 1; + int64_t *position_ids_ptr = positions.data_ptr(); + void *query_dst_ptr = query_dst.data_ptr(); + void *key_dst_ptr = key_dst.data_ptr(); + void *query_ptr = query.data_ptr(); + void *key_ptr = key.data_ptr(); + void *cos_sin_cache_ptr = cos_sin_cache.data_ptr(); + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.stride(seq_dim_idx); + int64_t dst_query_stride = query_dst.stride(0); + int64_t dst_key_stride = key_dst.stride(0); + at::ScalarType scalar_type = query.scalar_type(); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("rotary_embedding"); + cmd.SetCustomHandler([scalar_type, is_neox, num_tokens, stream, position_ids_ptr, query_dst_ptr, key_dst_ptr, + query_ptr, key_ptr, cos_sin_cache_ptr, rot_dim, query_stride, key_stride, + dst_query_stride, dst_key_stride, num_heads, num_kv_heads, head_size]() -> int { + auto dtype_num = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + uint32_t loop_cnt = (num_tokens + aiv_num - 1) / aiv_num; + rotary_embedding_impl(dtype_num, is_neox, stream, position_ids_ptr, query_dst_ptr, key_dst_ptr, query_ptr, + key_ptr, cos_sin_cache_ptr, rot_dim, query_stride, key_stride, dst_query_stride, + dst_key_stride, num_heads, num_kv_heads, head_size, num_tokens, loop_cnt, aiv_num); + return 0; + }); + cmd.Run(); + return {query_dst, key_dst}; +} + +std::tuple get_masked_input_and_mask( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t 
added_vocab_start_index, + const int64_t added_vocab_end_index) + /* + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198 + Embedding parallelized in the vocabulary dimension. + + Adapted from torch.nn.Embedding, note that we pad the vocabulary size to + make sure it is divisible by the number of model parallel GPUs. + + In order to support various loading methods, we ensure that LoRA-added + embeddings are always at the end of TP-sharded tensors. In other words, + we shard base embeddings and LoRA embeddings separately (both padded), + and place them in the same tensor. + In this example, we will have the original vocab size = 1010, + added vocab size = 16 and padding to 64. Therefore, the total + vocab size with padding will be 1088 (because we first pad 1010 to + 1024, add 16, and then pad to 1088). + Therefore, the tensor format looks like the following: + TP1, rank 0 (no sharding): + |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >| + corresponding token_id: | 0 | 1 | ... | 1009 | -1 | ... | -1 | 1010 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 | + + TP2, rank 0: + |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >| + corresponding token_id: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 1000 | ... | 1015 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 527 | 520 | ... | 543 | + TP2, rank 1: + |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >| + corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 | + index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... 
| 543 | + Parameters: + org_vocab_start_index //base embeddings start + org_vocab_end_index //base embeddings end + num_org_vocab_padding //base embeddings padding + added_vocab_start_index //LoRA embeddings start + added_vocab_end_index //LoRA embeddings end + */ +{ + // Input validation + TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension"); + TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative"); + TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index"); + TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative"); + TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index"); + TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index"); + + // Get total number of elements + int64_t size = input.numel(); + + // Create output tensors + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input).to(at::kBool); + + // Get data pointers + void *input_ptr = input.data_ptr(); + void *masked_input_ptr = masked_input.data_ptr(); + void *mask_ptr = mask.data_ptr(); + + // Get current stream + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + + // Get scalar type + at::ScalarType scalar_type = input.scalar_type(); + + // Create and configure OpCommand + at_npu::native::OpCommand cmd; + cmd.Name("get_masked_input_and_mask"); + cmd.SetCustomHandler([scalar_type, size, stream, + input_ptr, masked_input_ptr, mask_ptr, + org_vocab_start_index, org_vocab_end_index, + num_org_vocab_padding, added_vocab_start_index, + added_vocab_end_index]() -> int { + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + uint32_t loop_cnt = (size + aiv_num - 1) / aiv_num; + 
+ // Call implementation + get_masked_input_and_mask_impl( + stream, + input_ptr, + masked_input_ptr, + mask_ptr, + org_vocab_start_index, + org_vocab_end_index, + num_org_vocab_padding, + added_vocab_start_index, + added_vocab_end_index, + size, + loop_cnt, + aiv_num); + + return 0; + }); + cmd.Run(); + return {masked_input, mask}; +} + +void bgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, double scale) +{ + at::ScalarType scalar_type = x.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]"); + TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0), + "the first dimension of x, y, indices should be same"); + TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out"); + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* indices_ptr = indices.data_ptr(); + int indices_size = indices.size(0); + void* y_ptr = y.data_ptr(); + int batch_size = x.size(0); + int input_hidden_token = x.size(1); + uint32_t lora_rank = y.size(1); + float scale_f = static_cast(scale); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("bgmv_shrink"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, batch_size, input_hidden_token, + lora_rank, scale_f]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int 
num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + bgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, batch_size, num_tokens_per_core, + input_hidden_token, lora_rank, scale_f); + return 0; + }); + cmd.Run(); + return; +} + +at::Tensor bgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, + int64_t slice_offset, int64_t slice_size) +{ + at::ScalarType scalar_type = y.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]"); + TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0), + "the first dimension of x, y, indices should be same"); + TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out"); + TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0"); + TORCH_CHECK((slice_size + slice_offset) <= y.size(1), + "slice_size + slice_offset should be smaller than the second dimension of y") + + at::Tensor y_out = y; + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* indices_ptr = indices.data_ptr(); + int indices_size = indices.size(0); + void* y_ptr = y.data_ptr(); + void* y_out_ptr = y_out.data_ptr(); + int batch_size = x.size(0); + int lora_rank = x.size(1); + int output_full_dim = y.size(1); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("bgmv_expand"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, 
y_out_ptr, batch_size, lora_rank, + slice_offset, slice_size, output_full_dim]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + bgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, y_out_ptr, batch_size, + num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim); + return 0; + }); + cmd.Run(); + return y_out; +} + +void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, double scale) +{ + at::ScalarType scalar_type = x.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out"); + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* lora_indices_ptr = lora_indices.data_ptr(); + void* seq_len_ptr = seq_len.data_ptr(); + int lora_indices_size = lora_indices.size(0); + int seq_len_size = seq_len.size(0); + void* y_ptr = y.data_ptr(); + int batch_size = x.size(0); + int input_hidden_token = x.size(1); + uint32_t lora_rank = y.size(1); + float scale_f = static_cast(scale); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("sgmv_shrink"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, + 
seq_len_ptr, seq_len_size, y_ptr, + batch_size, input_hidden_token, lora_rank, scale_f]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, + y_ptr, batch_size, + num_tokens_per_core, input_hidden_token, lora_rank, scale_f); + return 0; + }); + cmd.Run(); + return; +} + +at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, int64_t slice_offset, int64_t slice_size) +{ + at::ScalarType scalar_type = y.scalar_type(); + TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16"); + TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]"); + TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4, + "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]"); + TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]"); + TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out"); + TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0"); + TORCH_CHECK((slice_size + slice_offset) <= y.size(1), + "slice_size + slice_offset should be smaller than the second dimension of y") + + at::Tensor y_out = y; + void* x_ptr = x.data_ptr(); + void* weight_ptr = weight.data_ptr(); + void* lora_indices_ptr = lora_indices.data_ptr(); + void* seq_len_ptr = seq_len.data_ptr(); + int lora_indices_size = lora_indices.size(0); + int seq_len_size = seq_len.size(0); + void* y_ptr = y.data_ptr(); + void* y_out_ptr = y_out.data_ptr(); + int batch_size = x.size(0); + int 
lora_rank = x.size(1); + int output_full_dim = y.size(1); + aclrtStream stream = c10_npu::getCurrentNPUStream().stream(); + at_npu::native::OpCommand cmd; + cmd.Name("sgmv_expand"); + cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr, + batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int { + auto dtype = get_dtype_from_torch(scalar_type); + int device_id = 0; + int64_t aiv_num = 0; + TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS); + int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num; + TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0"); + sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr, + batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim); + return 0; + }); + cmd.Run(); + return y_out; +} +} // namespace vllm_ascend + +TORCH_LIBRARY_EXPAND(_C, ops) +{ + // vLLM-Ascend custom ops + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); + ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor); + + // Rotary embedding + // Apply GPT-NeoX style rotary embedding to query and key. + ops.def( + "rotary_embedding(Tensor positions, Tensor! query," + " Tensor! key, int head_size," + " Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)"); + ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding); + + ops.def( + "get_masked_input_and_mask(Tensor input, " + " int org_vocab_start_index, " + " int org_vocab_end_index, " + " int num_org_vocab_padding, " + " int added_vocab_start_index, " + " int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)"); + ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask); + + ops.def("bgmv_shrink(Tensor! 
x, Tensor! weight, Tensor! indices, Tensor! y, float scale) -> ()"); + ops.impl("bgmv_shrink", torch::kPrivateUse1, &vllm_ascend::bgmv_shrink); + + ops.def( + "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y," + " int slice_offset, int slice_size) -> Tensor"); + ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand); + + ops.def("sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y, float scale) -> ()"); + ops.impl("sgmv_shrink", torch::kPrivateUse1, &vllm_ascend::sgmv_shrink); + + ops.def( + "sgmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y," + " int slice_offset, int slice_size) -> Tensor"); + ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand); +} + +REGISTER_EXTENSION(_C) diff --git a/csrc/torch_binding_meta.cpp b/csrc/torch_binding_meta.cpp new file mode 100644 index 0000000..d69254b --- /dev/null +++ b/csrc/torch_binding_meta.cpp @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include +#include "utils.h" +/* + * How to write a meta implementation for a custom operator (meta kernel): + * + * Meta implementations are used for shape and dtype inference, tracing, and export. + * They do NOT perform any real computation or allocate device memory. + * Instead, they return empty tensors with the correct shapes, dtypes, and device types. + * + * Steps to write a meta implementation: + * 1. The function signature should match the operator's schema, but only use the arguments + * necessary to infer output shapes and dtypes. + * 2. Use input tensor shapes, dtypes, and any relevant arguments to compute the output shapes. + * 3. Return empty tensors (e.g., at::empty_symint, at::empty_like) with the correct shape and dtype. + * 4. Do NOT perform any real computation or data movement. + * 5. Register the meta implementation with the "Meta" dispatch key using TORCH_LIBRARY_IMPL or similar. 
+ * + * Example: + * std::tuple my_op_meta( + * at::Tensor &input, int64_t some_param) { + * // Infer output shape based on input and parameters + * auto out_shape = ...; + * at::Tensor out = at::empty_symint(out_shape, input.options()); + * // Return empty tensor(s) with correct shape/dtype + * return {out, ...}; + * } + * + * See below for real examples. + */ + +namespace vllm_ascend { +namespace meta { + +std::tuple rotary_embedding_meta( + at::Tensor &positions, + at::Tensor &query, + at::Tensor &key, + int64_t head_size, + at::Tensor &cos_sin_cache, + bool is_neox) { + auto num_tokens = positions.sym_numel(); + auto query_hidden_size = query.sym_numel() / num_tokens; + auto key_hidden_size = key.sym_numel() / num_tokens; + + auto num_heads = query_hidden_size / head_size; + auto num_kv_heads = key_hidden_size / head_size; + at::Tensor query_dst = at::empty_symint({num_tokens, num_heads, head_size}, query.options()); + at::Tensor key_dst = at::empty_symint({num_tokens, num_kv_heads, head_size}, key.options()); + + return {query_dst, key_dst}; +} + +std::tuple get_masked_input_and_mask_meta( + at::Tensor &input, + const int64_t org_vocab_start_index, + const int64_t org_vocab_end_index, + const int64_t num_org_vocab_padding, + const int64_t added_vocab_start_index, + const int64_t added_vocab_end_index) { + + at::Tensor masked_input = at::empty_like(input); + at::Tensor mask = at::empty_like(input, input.options().dtype(at::kBool)); + + return {masked_input, mask}; +} + +at::Tensor bgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, + int64_t slice_offset, int64_t slice_size) { + at::Tensor y_out = at::empty_like(y); + return y_out; +} + +at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len, + at::Tensor &y, int64_t slice_offset, int64_t slice_size) { + at::Tensor y_out = at::empty_like(y); + return y_out; +} + + +} // namespace meta +} // namespace vllm_ascend + 
+namespace { + // Register the meta implementations of the custom kernels for symbolic tracing, this will also + // the custom kernel been captured into aclgraph + TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) { + // Rotary embedding meta implementation + ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta); + // Masked input and mask meta implementation + ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta); + // Bgmv expand + ops.impl("bgmv_expand", &vllm_ascend::meta::bgmv_expand_meta); + // Sgmv expand + ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta); + +} +} \ No newline at end of file diff --git a/csrc/utils.h b/csrc/utils.h new file mode 100644 index 0000000..74481e1 --- /dev/null +++ b/csrc/utils.h @@ -0,0 +1,31 @@ +#pragma once + +#include "kernels/types.h" +#include +#include + +#define _CONCAT(A, B) A##B +#define CONCAT(A, B) _CONCAT(A, B) + +#define _STRINGIFY(A) #A +#define STRINGIFY(A) _STRINGIFY(A) + +// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE) + +// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME +// could be a macro instead of a literal token. +#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \ + TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE) + +// REGISTER_EXTENSION allows the shared library to be loaded and initialized +// via python's import statement. 
+#define REGISTER_EXTENSION(NAME) \ + PyMODINIT_FUNC CONCAT(PyInit_, NAME)() { \ + static struct PyModuleDef module = {PyModuleDef_HEAD_INIT, \ + STRINGIFY(NAME), nullptr, 0, nullptr}; \ + return PyModule_Create(&module); \ + } + + diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..fe062fc --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,25 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +intl: + sphinx-intl build + @$(SPHINXBUILD) -b html -D language=zh_CN "$(SOURCEDIR)" "$(BUILDDIR)/html/zh-cn" $(SPHINXOPTS) $(O) + diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..68edc4d --- /dev/null +++ b/docs/README.md @@ -0,0 +1,24 @@ +# vLLM Ascend Plugin documents + +Live doc: https://vllm-ascend.readthedocs.io + +## Build the docs + +```bash +# Install dependencies. +pip install -r requirements-docs.txt + +# Build the docs. 
+make clean +make html + +# Build the docs with translation +make intl + +# Open the docs with your browser +python -m http.server -d _build/html/ +``` + +Launch your browser and open: +- English version: http://localhost:8000 +- Chinese version: http://localhost:8000/zh_CN diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt new file mode 100644 index 0000000..b8d3ac7 --- /dev/null +++ b/docs/requirements-docs.txt @@ -0,0 +1,10 @@ +sphinx +sphinx-argparse +sphinx-book-theme +sphinx-copybutton +sphinx-design +sphinx-togglebutton +myst-parser +msgspec +sphinx-substitution-extensions +sphinx-intl \ No newline at end of file diff --git a/docs/requirements-test.txt b/docs/requirements-test.txt new file mode 100644 index 0000000..c63a6da --- /dev/null +++ b/docs/requirements-test.txt @@ -0,0 +1,2 @@ +pytest-asyncio +pytest-mock diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html new file mode 100644 index 0000000..9cea301 --- /dev/null +++ b/docs/source/_templates/sections/header.html @@ -0,0 +1,58 @@ + + + +
+

You are viewing the latest developer preview docs. Click here to view docs for the latest stable release (v0.9.1).

+
\ No newline at end of file diff --git a/docs/source/assets/multi_node_dp_deepseek.png b/docs/source/assets/multi_node_dp_deepseek.png new file mode 100644 index 0000000..1809dc3 Binary files /dev/null and b/docs/source/assets/multi_node_dp_deepseek.png differ diff --git a/docs/source/assets/multi_node_dp_kimi.png b/docs/source/assets/multi_node_dp_kimi.png new file mode 100644 index 0000000..a4ffc08 Binary files /dev/null and b/docs/source/assets/multi_node_dp_kimi.png differ diff --git a/docs/source/community/contributors.md b/docs/source/community/contributors.md new file mode 100644 index 0000000..c1ebe9f --- /dev/null +++ b/docs/source/community/contributors.md @@ -0,0 +1,138 @@ +# Maintainers and contributors + +## Maintainers + +| Name | Github ID | Date | +|:-----------:|:-----:|:-----:| +| Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 | +| Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 | +| Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 | +| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 | +| Wengang Chen | [@ApsarasX](https://github.com/ApsarasX) | 2025/08 | +| Mengqing Cao | [@MengqingCao](https://github.com/MengqingCao) | 2025/08 | + +## Contributors + +vLLM Ascend every release would not have been possible without the following contributors: + +Updated on 2025-09-03: + +| Number | Contributor | Date | Commit ID | +|:------:|:-----------:|:-----:|:---------:| +| 117 | [@panchao-hub](https://github.com/panchao-hub) | 2025/8/30 | [7215454](https://github.com/vllm-project/vllm-ascend/commit/7215454de6df78f4f9a49a99c5739f8bb360f5bc) | +| 116 | [@lidenghui1110](https://github.com/lidenghui1110) | 2025/8/29 | [600b08f](https://github.com/vllm-project/vllm-ascend/commit/600b08f7542be3409c2c70927c91471e8de33d03) | +| 115 | [@NSDie](https://github.com/NSDie) | 2025/8/28 | [1191a64](https://github.com/vllm-project/vllm-ascend/commit/1191a64ae508183d5613711bc98a90250963f83a) | +| 114 | 
[@s-jiayang](https://github.com/s-jiayang) | 2025/8/27 | [6a4ec18](https://github.com/vllm-project/vllm-ascend/commit/6a4ec186e731b9516235f4fd30b5b98227513fe7) | +| 113 | [@LookAround0301](https://github.com/LookAround0301) | 2025/8/22 | [e9fb895](https://github.com/vllm-project/vllm-ascend/commit/e9fb895b10cef37ea634f4d4af71686b09ca9f20) | +| 112 | [@ZhaoJiangJiang](https://github.com/ZhaoJiangJiang) | 2025/8/22 | [3629bc4](https://github.com/vllm-project/vllm-ascend/commit/3629bc4431d3edb4224761f9036b3bddb16158d6) | +| 111 | [@NicholasTao](https://github.com/NicholasTao) | 2025/8/20 | [7bec1a9](https://github.com/vllm-project/vllm-ascend/commit/7bec1a9b9c372785551d45682bf11063ec42b216) | +| 110 | [@gameofdimension](https://github.com/gameofdimension) | 2025/8/19 | [27d038d](https://github.com/vllm-project/vllm-ascend/commit/27d038dc663bf550a35a8f15659493b2abefda07) | +| 109 | [@liuchenbing](https://github.com/liuchenbing) | 2025/8/19 | [3648d18](https://github.com/vllm-project/vllm-ascend/commit/3648d18e673f15a33a82d6ea95d3a9dd891ff1f5) | +| 108 | [@LCAIZJ](https://github.com/LCAIZJ) | 2025/8/18 | [03ca2b2](https://github.com/vllm-project/vllm-ascend/commit/03ca2b26ca9ab6b9a12f021b0595a726ee35e223) | +| 107 | [@haojiangzheng](https://github.com/haojiangzheng) | 2025/8/11 | [eb43a47](https://github.com/vllm-project/vllm-ascend/commit/eb43a475f429192e7509e85e28b1c65d5097f373) | +| 106 | [@QwertyJack](https://github.com/QwertyJack) | 2025/8/11 | [9c6d108](https://github.com/vllm-project/vllm-ascend/commit/9c6d108330574176f79eea52f989ea6049336af8) | +| 105 | [@SlightwindSec](https://github.com/SlightwindSec) | 2025/8/5 | [f3b50c5](https://github.com/vllm-project/vllm-ascend/commit/f3b50c54e8243ad8ccefb9b033277fbdd382a9c4) | +| 104 | [@CaveNightingale](https://github.com/CaveNightingale) | 2025/8/4 | [957c7f1](https://github.com/vllm-project/vllm-ascend/commit/957c7f108d5f0aea230220ccdc18d657229e4030) | +| 103 | [@underfituu](https://github.com/underfituu) | 2025/8/4 
| [e38fab0](https://github.com/vllm-project/vllm-ascend/commit/e38fab011d0b81f3a8e40d9bbe263c283dd4129b) | +| 102 | [@yangqinghao-cmss](https://github.com/yangqinghao-cmss) | 2025/8/1 | [99fa0ac](https://github.com/vllm-project/vllm-ascend/commit/99fa0ac882c79ae9282940125b042a44ea422757) | +| 101 | [@pjgao](https://github.com/pjgao) | 2025/7/31 | [6192bc9](https://github.com/vllm-project/vllm-ascend/commit/6192bc95c0e47097836e9be1f30f2a0a6fdca088) | +| 100 | [@Liccol](https://github.com/Liccol) | 2025/7/31 | [7c90ba5](https://github.com/vllm-project/vllm-ascend/commit/7c90ba5fe8e420b891fdd30df050a33e3767835d) | +| 99 | [@1024daniel](https://github.com/1024daniel) | 2025/7/31 | [db310c6](https://github.com/vllm-project/vllm-ascend/commit/db310c6ec97b056296f7c2348b90c1d96d0b562a) | +| 98 | [@zhoux77899](https://github.com/zhoux77899) | 2025/7/30 | [4fcca13](https://github.com/vllm-project/vllm-ascend/commit/4fcca137a70c11daa4070ae014288be154715939) | +| 97 | [@YuanCheng-coder](https://github.com/YuanCheng-coder) | 2025/7/30 | [34dd24a](https://github.com/vllm-project/vllm-ascend/commit/34dd24adf21fb85a2c413292754b1599832efae2) | +| 96 | [@hongfugui](https://github.com/hongfugui) | 2025/7/30 | [1dbb888](https://github.com/vllm-project/vllm-ascend/commit/1dbb8882759e4326f5706f6e610674423376c2f3) | +| 95 | [@Irving11-BKN](https://github.com/Irving11-BKN) | 2025/7/29 | [ca8007f](https://github.com/vllm-project/vllm-ascend/commit/ca8007f584141d3a59b2bcbd4f8ba269c9b7e252) | +| 94 | [@taoxudonghaha](https://github.com/taoxudonghaha) | 2025/7/29 | [540336e](https://github.com/vllm-project/vllm-ascend/commit/540336edc9db09072a9aaa486fbf7ce625da5b9e) | +| 93 | [@loukong33](https://github.com/loukong33) | 2025/7/28 | [1a25b0a](https://github.com/vllm-project/vllm-ascend/commit/1a25b0a2ddb23bf4d731ebac4503efaf237b191f) | +| 92 | [@Ronald1995](https://github.com/Ronald1995) | 2025/7/25 | 
[e561a2c](https://github.com/vllm-project/vllm-ascend/commit/e561a2c6ec4493b490b13a4a9007d8f451ae0d0f) | +| 91 | [@ZrBac](https://github.com/ZrBac) | 2025/7/24 | [2ffe051](https://github.com/vllm-project/vllm-ascend/commit/2ffe051859d585df8353d1b9eefb64c44078175a) | +| 90 | [@SunnyLee151064](https://github.com/SunnyLee151064) | 2025/7/24 | [34571ea](https://github.com/vllm-project/vllm-ascend/commit/34571ea5ae69529758edf75f0252f86ccb4c7184) | +| 89 | [@shiyuan680](https://github.com/shiyuan680) | 2025/7/23 | [ac0bf13](https://github.com/vllm-project/vllm-ascend/commit/ac0bf133f47ead20f18bf71f9be6dbe05fbd218f) | +| 88 | [@aidoczh](https://github.com/aidoczh) | 2025/7/21 | [c32eea9](https://github.com/vllm-project/vllm-ascend/commit/c32eea96b73d26268070f57ef98416decc98aff7) | +| 87 | [@nuclearwu](https://github.com/nuclearwu) | 2025/7/20 | [54f2b31](https://github.com/vllm-project/vllm-ascend/commit/54f2b311848badc86371d269140e729012a60f2c) | +| 86 | [@pkking](https://github.com/pkking) | 2025/7/18 | [3e39d72](https://github.com/vllm-project/vllm-ascend/commit/3e39d7234c0e5c66b184c136c602e87272b5a36e) | +| 85 | [@lianyiibo](https://github.com/lianyiibo) | 2025/7/18 | [53d2ea3](https://github.com/vllm-project/vllm-ascend/commit/53d2ea3789ffce32bf3ceb055d5582d28eadc6c7) | +| 84 | [@xudongLi-cmss](https://github.com/xudongLi-cmss) | 2025/7/2 | [7fc1a98](https://github.com/vllm-project/vllm-ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576) | +| 83 | [@ZhengWG](https://github.com/) | 2025/7/7 | [3a469de](https://github.com/vllm-project/vllm-ascend/commit/9c886d0a1f0fc011692090b0395d734c83a469de) | +| 82 | [@wm901115nwpu](https://github.com/) | 2025/7/7 | [a2a47d4](https://github.com/vllm-project/vllm-ascend/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4) | +| 81 | [@Agonixiaoxiao](https://github.com/) | 2025/7/2 | [6f84576](https://github.com/vllm-project/vllm-ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576) | +| 80 | 
[@zhanghw0354](https://github.com/zhanghw0354) | 2025/7/2 | [d3df9a5](https://github.com/vllm-project/vllm-ascend/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5) | +| 79 | [@GDzhu01](https://github.com/GDzhu01) | 2025/6/28 | [de256ac](https://github.com/vllm-project/vllm-ascend/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac) | +| 78 | [@leo-pony](https://github.com/leo-pony) | 2025/6/26 | [3f2a5f2](https://github.com/vllm-project/vllm-ascend/commit/10253449120307e3b45f99d82218ba53e3f2a5f2) | +| 77 | [@zeshengzong](https://github.com/zeshengzong) | 2025/6/26 | [3ee25aa](https://github.com/vllm-project/vllm-ascend/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa) | +| 76 | [@sharonyunyun](https://github.com/sharonyunyun) | 2025/6/25 | [2dd8666](https://github.com/vllm-project/vllm-ascend/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666) | +| 75 | [@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee) | 2025/6/25 | [c65dd40](https://github.com/vllm-project/vllm-ascend/commit/2fda60464c287fe456b4a2f27e63996edc65dd40) | +| 74 | [@xleoken](https://github.com/xleoken) | 2025/6/23 | [c604de0](https://github.com/vllm-project/vllm-ascend/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0) | +| 73 | [@lyj-jjj](https://github.com/lyj-jjj) | 2025/6/23 | [5cbd74e](https://github.com/vllm-project/vllm-ascend/commit/5177bef87a21331dcca11159d3d1438075cbd74e) | +| 72 | [@farawayboat](https://github.com/farawayboat)| 2025/6/21 | [bc7d392](https://github.com/vllm-project/vllm-ascend/commit/097e7149f75c0806774bc68207f0f6270bc7d392) +| 71 | [@yuancaoyaoHW](https://github.com/yuancaoyaoHW) | 2025/6/20 | [7aa0b94](https://github.com/vllm-project/vllm-ascend/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94) +| 70 | [@songshanhu07](https://github.com/songshanhu07) | 2025/6/18 | [5e1de1f](https://github.com/vllm-project/vllm-ascend/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f) +| 69 | [@wangyanhui-cmss](https://github.com/wangyanhui-cmss) | 2025/6/12| 
[40c9e88](https://github.com/vllm-project/vllm-ascend/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88) | +| 68 | [@chenwaner](https://github.com/chenwaner) | 2025/6/11 | [c696169](https://github.com/vllm-project/vllm-ascend/commit/e46dc142bf1180453c64226d76854fc1ec696169) | +| 67 | [@yzim](https://github.com/yzim) | 2025/6/11 | [aaf701b](https://github.com/vllm-project/vllm-ascend/commit/4153a5091b698c2270d160409e7fee73baaf701b) | +| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) | +| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) | +| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) | +| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) | +| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) | +| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) | +| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | [6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) | +| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) | +| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) | +| 57 | 
[@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) | +| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) | +| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) | +| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) | +| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) | +| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | [dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) | +| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) | +| 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) | +| 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) | +| 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) | +| 47 | [@NeverRaR](https://github.com/NeverRaR) | 2025/5/12 | [efabd72](https://github.com/vllm-project/vllm-ascend/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1) | +| 46 | [@chris668899](https://github.com/chris668899) | 2025/5/8 | 
[6c02088](https://github.com/vllm-project/vllm-ascend/commit/6c020883a8332b5c519f4f6502733edd9b391c2b) | +| 45 | [@sunbaosong](https://github.com/sunbaosong) | 2025/5/6 | [d6bfae8](https://github.com/vllm-project/vllm-ascend/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4) | +| 44 | [@ApsarasX](https://github.com/ApsarasX) | 2025/4/29 | [87975fa](https://github.com/vllm-project/vllm-ascend/commit/87975fa058fe3f90d204ded42a08989a8dcb413e) | +| 43 | [@zouyida2052](https://github.com/zouyida2052) | 2025/4/28 | [b9528e6](https://github.com/vllm-project/vllm-ascend/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b) | +| 42 | [@ZhengJun9](https://github.com/ZhengJun9) | 2025/4/28 | [1791113](https://github.com/vllm-project/vllm-ascend/commit/17911138c90d78a76bd691e9dcb56763db35b19f) | +| 41 | [@linfeng-yuan](https://github.com/linfeng-yuan) | 2025/4/28 | [2204e4d](https://github.com/vllm-project/vllm-ascend/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd) | +| 40 | [@jianzs](https://github.com/jianzs) | 2025/4/27 | [fa4a5d9](https://github.com/vllm-project/vllm-ascend/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5) | +| 39 | [@fakeYan](https://github.com/fakeYan) | 2025/4/23 | [05bdcbe](https://github.com/vllm-project/vllm-ascend/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421) | +| 38 | [@RongRongStudio](https://github.com/RongRongStudio) | 2025/4/22 | [848e041](https://github.com/vllm-project/vllm-ascend/commit/848e041a54732c923660dd02daf8e9bf439736a2) | +| 37 | [@paulyu12](https://github.com/paulyu12) | 2025/4/17 | [697908f](https://github.com/vllm-project/vllm-ascend/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e) | +| 36 | [@heartStrive1998](https://github.com/heartStrive1998) | 2025/4/16 | [2f15503](https://github.com/vllm-project/vllm-ascend/commit/2f155039dc3997640854daef469bbf0cb77dc6ed) | +| 35 | [@eeethenQ](https://github.com/eeethenQ) | 2025/4/15 | [44a8301](https://github.com/vllm-project/vllm-ascend/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15) | +| 34 | 
[@wxsIcey](https://github.com/wxsIcey) | 2025/4/10 | [d05ea17](https://github.com/vllm-project/vllm-ascend/commit/d05ea17427b82a506b97409a7de8359f18f565f7) | +| 33 | [@yx0716](https://github.com/yx0716) | 2025/4/8 | [5d62393](https://github.com/vllm-project/vllm-ascend/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20) | +| 32 | [@celestialli](https://github.com/celestialli) | 2025/4/7 | [2b765dc](https://github.com/vllm-project/vllm-ascend/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0) | +| 31 | [@hfadzxy](https://github.com/hfadzxy) | 2025/3/30 | [7beb433](https://github.com/vllm-project/vllm-ascend/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f) | +| 30 | [@wuhuikx](https://github.com/wuhuikx) | 2025/3/28 | [57a84bb](https://github.com/vllm-project/vllm-ascend/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca) | +| 29 | [@zzzzwwjj](https://github.com/zzzzwwjj) | 2025/3/28 | [12390af](https://github.com/vllm-project/vllm-ascend/commit/12390af075962456ecc8233d8dcce7064b75f390) | +| 28 | [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/3/28 | [27e86b9](https://github.com/vllm-project/vllm-ascend/commit/27e86b993a6a810d818143ec9dbfc439a419fa77) | +| 27 | [@ZhengZhenyu](https://github.com/ZhengZhenyu) | 2025/3/26 | [0b5a964](https://github.com/vllm-project/vllm-ascend/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841) | +| 26 | [@baifanxxx](https://github.com/baifanxxx) | 2025/3/26 | [1225052](https://github.com/vllm-project/vllm-ascend/commit/122505208ff6284f409846ca7294f4a4b9883285) | +| 25 | [@rjg-lyh](https://github.com/rjg-lyh) | 2025/3/13 | [6512470](https://github.com/vllm-project/vllm-ascend/commit/65124705fb39d4cc2c94c80254421e067a82fe50) | +| 24 | [@xiemingda-1002](https://github.com/xiemingda-1002) | 2025/3/12 | [59ea23d](https://github.com/vllm-project/vllm-ascend/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5) | +| 23 | [@yiz-liu](https://github.com/yiz-liu) | 2025/3/11 | 
[0db6670](https://github.com/vllm-project/vllm-ascend/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca) | +| 22 | [@new-TonyWang](https://github.com/new-TonyWang) | 2025/3/11 | [dfb4e23](https://github.com/vllm-project/vllm-ascend/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e) | +| 21 | [@mengwei805](https://github.com/mengwei805) | 2025/3/6 | [8fcf3d1](https://github.com/vllm-project/vllm-ascend/commit/8fcf3d1704084626db35c5dc82ade446508598d4) | +| 20 | [@baymax591](https://github.com/baymax591) | 2025/2/28 | [e8131b9](https://github.com/vllm-project/vllm-ascend/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b) | +| 19 | [@dependabot](https://github.com/dependabot) | 2025/2/27 | [a5564ed](https://github.com/vllm-project/vllm-ascend/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c) | +| 18 | [@shink](https://github.com/shink) | 2025/2/27 | [6aed833](https://github.com/vllm-project/vllm-ascend/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb) | +| 17 | [@wwfu109](https://github.com/wwfu109) | 2025/2/27 | [b074047](https://github.com/vllm-project/vllm-ascend/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e) | +| 16 | [@kunpengW-code](https://github.com/kunpengW-code) | 2025/2/26 | [ca807ce](https://github.com/vllm-project/vllm-ascend/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45) | +| 15 | [@Yaphets24](https://github.com/Yaphets24) | 2025/2/22 | [d0b3cb4](https://github.com/vllm-project/vllm-ascend/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4) | +| 14 | [@noemotiovon](https://github.com/noemotiovon) | 2025/2/21 | [202b39a](https://github.com/vllm-project/vllm-ascend/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0) | +| 13 | [@SidaoY](https://github.com/SidaoY) | 2025/2/18 | [718c763](https://github.com/vllm-project/vllm-ascend/commit/718c7638555d12cd43ea2a9e497e185778b68595) | +| 12 | [@ShiyaNiu](https://github.com/ShiyaNiu) | 2025/2/17 | [36ea38f](https://github.com/vllm-project/vllm-ascend/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38) | +| 11 | 
[@ji-huazhong](https://github.com/ji-huazhong) | 2025/2/12 | [c8b57d1](https://github.com/vllm-project/vllm-ascend/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82) | +| 10 | [@Angazenn](https://github.com/Angazenn) | 2025/2/11 | [7637759](https://github.com/vllm-project/vllm-ascend/commit/7637759056028839c74960d9cfd3ce6275ee5d35) | +| 9 | [@whx-sjtu](https://github.com/whx-sjtu) | 2025/2/7 | [8fc5dc9](https://github.com/vllm-project/vllm-ascend/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e) | +| 8 | [@zouyida2002](https://github.com/zouyida2002) | 2025/2/7 | [4495fc6](https://github.com/vllm-project/vllm-ascend/commit/4495fc68389e3fb1ef14534c202948931e38446b) | +| 7 | [@hw_whx](https://github.com/hw_whx) | 2025/2/7 | [7d16772](https://github.com/vllm-project/vllm-ascend/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a) | +| 6 | [@MengqingCao](https://github.com/MengqingCao) | 2025/2/6 | [7d9ae22](https://github.com/vllm-project/vllm-ascend/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730) | +| 5 | [@Potabk](https://github.com/Potabk) | 2025/2/6 | [8cb5615](https://github.com/vllm-project/vllm-ascend/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86) | +| 4 | [@wangxiyuan](https://github.com/wangxiyuan) | 2025/2/6 | [a48b9ad](https://github.com/vllm-project/vllm-ascend/commit/a48b9addefd292af523644411d4ff4142dd4bc66) | +| 3 | [@shen-shanshan](https://github.com/shen-shanshan) | 2025/2/6 | [bfccf73](https://github.com/vllm-project/vllm-ascend/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e) | +| 2 | [@Yikun](https://github.com/Yikun) | 2025/2/5 | [d5e7756](https://github.com/vllm-project/vllm-ascend/commit/d5e7756028bd5884ade96b654555c375770a2f64) | +| 1 | [@simon-mo](https://github.com/simon-mo) | 2025/1/29 | [eb28342](https://github.com/vllm-project/vllm-ascend/commit/eb283428ddc17207b6866118f9bc15454b5b8801) | diff --git a/docs/source/community/governance.md b/docs/source/community/governance.md new file mode 100644 index 0000000..7a1ff75 --- /dev/null +++ 
b/docs/source/community/governance.md @@ -0,0 +1,48 @@ +# Governance + +## Mission +As a vital component of vLLM, the vLLM Ascend project is dedicated to providing an easy, fast, and cheap LLM Serving for Everyone on Ascend NPU, and to actively contribute to the enrichment of vLLM. + +## Principles +vLLM Ascend follows the vLLM community's code of conduct:[vLLM - CODE OF CONDUCT](https://github.com/vllm-project/vllm/blob/main/CODE_OF_CONDUCT.md) + +## Governance - Mechanics +vLLM Ascend is an open-source project under the vLLM community, where the authority to appoint roles is ultimately determined by the vLLM community. It adopts a hierarchical technical governance structure. + +- Contributor: + + **Responsibility:** Help new contributors on boarding, handle and respond to community questions, review RFCs, code + + **Requirements:** Complete at least 1 contribution. Contributor is someone who consistently and actively participates in a project, included but not limited to issue/review/commits/community involvement. + + Contributors will be empowered [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) Github repo `Triage` permissions (`Can read and clone this repository. Can also manage issues and pull requests`) to help community developers collaborate more efficiently. + +- Maintainer: + + **Responsibility:** Develop the project's vision and mission. Maintainers are responsible for driving the technical direction of the entire project and ensuring its overall success, possessing code merge permissions. They formulate the roadmap, review contributions from community members, continuously contribute code, and actively engage in community activities (such as regular meetings/events). + + **Requirements:** Deep understanding of ‌vLLM‌ and ‌vLLM Ascend‌ codebases, with a commitment to sustained code contributions. Competency in ‌design/development/PR review workflows‌. 
+ - **Review Quality‌:** Actively participate in community code reviews, ensuring high-quality code integration. + - **Quality Contribution‌:** Successfully develop and deliver at least one major feature while maintaining consistent high-quality contributions. + - **Community Involvement‌:** Actively address issues, respond to forum inquiries, participate in discussions, and engage in community-driven tasks. + + Requires approval from existing Maintainers. The vLLM community has the final decision-making authority. + + Maintainer will be empowered [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) Github repo write permissions (`Can read, clone, and push to this repository. Can also manage issues and pull requests`). + +## Nominating and Removing Maintainers + +### The Principles + +- Membership in vLLM Ascend is given to individuals on merit basis after they demonstrated strong expertise of the vLLM / vLLM Ascend through contributions, reviews and discussions. + +- For membership in the maintainer group the individual has to demonstrate strong and continued alignment with the overall vLLM / vLLM Ascend principles. + +- Light criteria of moving module maintenance to ‘emeritus’ status if they don’t actively participate over long periods of time. + +- The membership is for an individual, not a company. + +### Nomination and Removal + +- Nomination: Anyone can nominate someone to become a maintainer (include self-nominate). All existing maintainers are responsible for evaluating the nomination. The nominator should provide nominee's info around the strength of the candidate to be a maintainer, include but not limited to review quality, quality contribution, community involvement. +- Removal: Anyone can nominate a person to be removed from maintainer position (include self-nominate). All existing maintainers are responsible for evaluating the nomination. 
The nominator should provide nominee's info, include but not limited to lack of activity, conflict with the overall direction and other information that makes them unfit to be a maintainer. diff --git a/docs/source/community/user_stories/index.md b/docs/source/community/user_stories/index.md new file mode 100644 index 0000000..1dc1e56 --- /dev/null +++ b/docs/source/community/user_stories/index.md @@ -0,0 +1,19 @@ +# User Stories + +Read case studies on how users and developers solves real, everyday problems with vLLM Ascend + +- [LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform for training and fine-tuning large language models, it supports vLLM Ascend to speed up inference since [LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gain 2x performance enhancement of inference. + +- [Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge library designed for post-training foundation models using advanced techniques like SFT, PPO and DPO, it uses vLLM Ascend since [v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to support RLHF on Ascend NPU. + +- [MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference engine acceleration plug-in library developed by Huawei on Ascend hardware, which includes self-developed large language model optimization algorithms and optimizations related to the inference engine framework. It supports vLLM Ascend since [2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0001.html). + +- [GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU cluster manager for running AI models. It supports vLLM Ascend since [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more GPUStack performance evaluation info on [link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew). 
+
+- [verl](https://github.com/volcengine/verl) is a flexible, efficient and production-ready RL training library for large language models (LLMs); it uses vLLM Ascend since [v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more info on [verl x Ascend Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html).
+
+:::{toctree}
+:caption: More details
+:maxdepth: 1
+llamafactory
+::: diff --git a/docs/source/community/user_stories/llamafactory.md b/docs/source/community/user_stories/llamafactory.md new file mode 100644 index 0000000..5b82b83 --- /dev/null +++ b/docs/source/community/user_stories/llamafactory.md @@ -0,0 +1,19 @@ +# LLaMA-Factory
+
+**About / Introduction**
+
+[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use and efficient platform for training and fine-tuning large language models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally without writing any code.
+
+LLaMA-Factory users need to evaluate and run inference on the model after fine-tuning it.
+
+**The Business Challenge**
+
+LLaMA-Factory used transformers to perform inference on Ascend NPU, but the speed was slow.
+
+**Solving Challenges and Benefits with vLLM Ascend**
+
+With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the performance of LLaMA-Factory in the model inference stage has been significantly improved. According to the test results, the inference speed of LLaMA-Factory has been increased to 2x compared to the transformers version.
+
+**Learn more**
+
+See more about LLaMA-Factory and how it uses vLLM Ascend for inference on the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html).
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md new file mode 100644 index 0000000..a245dda --- /dev/null +++ b/docs/source/community/versioning_policy.md @@ -0,0 +1,131 @@ +# Versioning policy + +Starting with vLLM 0.7.x, the vLLM Ascend Plugin ([vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)) project follows the [PEP 440](https://peps.python.org/pep-0440/) to publish matching with vLLM ([vllm-project/vllm](https://github.com/vllm-project/vllm)). + +## vLLM Ascend Plugin versions + +Each vLLM Ascend release will be versioned: `v[major].[minor].[micro][rcN][.postN]` (such as +`v0.7.3rc1`, `v0.7.3`, `v0.7.3.post1`) + +- **Final releases**: will typically be released every **3 months**, will take the vLLM upstream release plan and Ascend software product release plan into comprehensive consideration. +- **Pre releases**: will typically be released **on demand**, ending with rcN, represents the Nth release candidate version, to support early testing by our users prior to a final release. +- **Post releases**: will typically be released **on demand** to support to address minor errors in a final release. It's different from [PEP-440 post release note](https://peps.python.org/pep-0440/#post-releases) suggestion, it will contain actual bug fixes considering that the final release version should be matched strictly with the vLLM final release version (`v[major].[minor].[micro]`). The post version has to be published as a patch version of the final release. + +For example: +- `v0.7.x`: it's the first final release to match the vLLM `v0.7.x` version. +- `v0.7.3rc1`: will be the first pre version of vLLM Ascend. +- `v0.7.3.post1`: will be the post release if the `v0.7.3` release has some minor errors. 
+ +## Release Compatibility Matrix + +Following is the Release Compatibility Matrix for vLLM Ascend Plugin: + +| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo | +|-------------|--------------|------------------|-------------|--------------------|--------------| +| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | | +| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | | +| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | +| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | | +| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1| | +| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | | +| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | | +| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | | +| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | 2.0rc1 | +| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | 2.0rc1 | + +## Release cadence + +### release window + +| Date | Event | +|------------|-------------------------------------------| +| 2025.09.04 | Release candidates, v0.10.1rc1 | +| 2025.09.03 | v0.9.1 Final release | +| 2025.08.22 | Release candidates, v0.9.1rc3 | +| 2025.08.07 | Release candidates, v0.10.0rc1 | +| 2025.08.04 | Release candidates, v0.9.1rc2 | +| 2025.07.11 | Release candidates, v0.9.2rc1 | +| 2025.06.22 | Release candidates, v0.9.1rc1 | +| 2025.06.10 | Release candidates, v0.9.0rc2 | +| 2025.06.09 | Release candidates, v0.9.0rc1 | +| 2025.05.29 | v0.7.x post release, v0.7.3.post1 | +| 2025.05.08 | v0.7.x Final release, v0.7.3 | +| 2025.05.06 | Release candidates, v0.8.5rc1 | +| 
2025.04.28 | Release candidates, v0.8.4rc2 |
+| 2025.04.18 | Release candidates, v0.8.4rc1 |
+| 2025.03.28 | Release candidates, v0.7.3rc2 |
+| 2025.03.14 | Release candidates, v0.7.3rc1 |
+| 2025.02.19 | Release candidates, v0.7.1rc1 |
+
+## Branch policy
+
+vLLM Ascend has a main branch and dev branches.
+
+- **main**: main branch, which corresponds to the vLLM main branch and the latest 1 or 2 release versions. It is continuously monitored for quality through Ascend CI.
+- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
+
+Usually, a commit should ONLY be merged first into the main branch, and then backported to the dev branch to reduce maintenance costs as much as possible.
+
+### Maintenance branch and EOL:
+The branch status will be in one of the following states:
+
+| Branch            | Time frame                       | Summary                                                              |
+|-------------------|----------------------------------|----------------------------------------------------------------------|
+| Maintained        | Approximately 2-3 minor versions | All bugfixes are appropriate. Releases produced, CI commitment.      |
+| Unmaintained      | Community interest driven        | All bugfixes are appropriate. No Releases produced, No CI commitment |
+| End of Life (EOL) | N/A                              | Branch no longer accepting changes                                   |
+
+### Branch state
+
+Note that vLLM Ascend will only be released for a certain vLLM release version rather than all versions. Hence, you might see that only some versions have dev branches (such as only `0.7.1-dev` / `0.7.3-dev` but no `0.7.2-dev`), this is as expected. 
+
+Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Ascend version branch and support its latest version (for example, we plan to support version 0.7.3) as shown below:
+
+| Branch     | Status       | Note                                 |
+|------------|--------------|--------------------------------------|
+| main       | Maintained   | CI commitment for vLLM main branch and vLLM 0.9.2 branch |
+| v0.9.1-dev | Maintained   | CI commitment for vLLM 0.9.1 version |
+| v0.7.3-dev | Maintained   | CI commitment for vLLM 0.7.3 version |
+| v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev               |
+
+### Feature branches
+
+| Branch     | Status       | RFC link                              | Merge plan | Mentor |
+|------------|--------------|---------------------------------------|------------|--------|
+|rfc/long_seq_optimization|Maintained|https://github.com/vllm-project/vllm/issues/22693|930|wangxiyuan|
+- Branch: The feature branch should be created with a prefix `rfc/` followed by the feature name, such as `rfc/feature-name`.
+- Status: The status of the feature branch is `Maintained` until it is merged into the main branch or deleted.
+- RFC link: The feature branch should be created with a corresponding RFC issue. The creation of a feature branch requires an RFC and approval from at least two maintainers.
+- Merge plan: The final goal of a feature branch is to merge it into the main branch. If it exceeds 3 months, the mentor maintainer should evaluate whether to delete the branch.
+- Mentor: The mentor should be a vLLM Ascend maintainer who is responsible for the feature branch.
+
+### Backward compatibility
+
+For the main branch, vLLM Ascend should work with the vLLM main branch and the latest 1 or 2 release versions. So to ensure backward compatibility, we will do the following:
+- Both the main branch and the target vLLM release are tested by Ascend E2E CI. For example, currently, the vLLM main branch and vLLM 0.8.4 are tested. 
+- For code changes, we will make sure that the changes are compatible with the latest 1 or 2 vLLM release versions as well. In this case, vLLM Ascend introduced a version check mechanism inside the code. It'll first check the version of the installed vLLM package to decide which code logic to use. If users hit the `InvalidVersion` error, it sometimes means that they have installed a dev/editable version of the vLLM package. In this case, we provide the env variable `VLLM_VERSION` to let users specify the version of the vLLM package to use.
+- For documentation changes, we will make sure that the changes are compatible with the latest 1 or 2 vLLM release versions as well. A note should be added if there are any breaking changes.
+
+## Document Branch Policy
+To reduce maintenance costs, **all branch documentation content should remain consistent, and version differences can be controlled via variables in [docs/source/conf.py](https://github.com/vllm-project/vllm-ascend/blob/main/docs/source/conf.py)**. While this is not a simple task, it is a principle we should strive to follow.
+
+| Version | Purpose | Code Branch |
+|-----|-----|---------|
+| latest | Doc for the latest dev branch | vX.Y.Z-dev (Will be `main` after the first final release) |
+| version | Doc for historical released versions | Git tags, like vX.Y.Z[rcN] |
+| stable (not yet released) | Doc for latest final release branch | Will be `vX.Y.Z-dev` after the first official release |
+
+As shown above:
+
+- `latest` documentation: Matches the current maintenance branch `vX.Y.Z-dev` (Will be `main` after the first final release). Continuously updated to ensure usability for the latest release.
+- `version` documentation: Corresponds to specific released versions (e.g., `v0.7.3`, `v0.7.3rc1`). No further updates after release.
+- `stable` documentation (**not yet released**): Official release documentation. Updates are allowed in real-time after release, typically based on vX.Y.Z-dev. 
Once stable documentation is available, non-stable versions should display a header warning: `You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.`.
+
+## Software Dependency Management
+- `torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable version to [PyPi](https://pypi.org/project/torch-npu)
+  every 3 months, a development version (aka the POC version) every month, and a nightly version every day.
+  The PyPi stable version **CAN** be used in vLLM Ascend final version, the monthly dev version **ONLY CAN** be used in
+  vLLM Ascend RC version for rapid iteration, the nightly version **CANNOT** be used in any vLLM Ascend version or branch. diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..82d7a28 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,142 @@ +#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/docs/source/conf.py
+#
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +import json +import os + +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +# -- Project information ----------------------------------------------------- + +project = 'vllm-ascend' +copyright = '2025, vllm-ascend team' +author = 'the vllm-ascend team' + +# The full version, including alpha/beta/rc tags +release = '' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. + +# Copy from https://github.com/vllm-project/vllm/blob/main/docs/source/conf.py +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx_copybutton", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "myst_parser", + "sphinxarg.ext", + "sphinx_design", + "sphinx_togglebutton", + "sphinx_substitution_extensions", +] + +myst_enable_extensions = ["colon_fence", "substitution"] + +# Change this when cut down release +myst_substitutions = { + # the branch of vllm, used in vllm clone + # - main branch: 'main' + # - vX.Y.Z branch: 'vX.Y.Z' + 'vllm_version': 'v0.10.1.1', + # the branch of vllm-ascend, used in vllm-ascend clone and image tag + # - main branch: 'main' + # - vX.Y.Z branch: latest vllm-ascend release tag + 'vllm_ascend_version': 'v0.10.1rc1', + # the newest release version of vllm-ascend and matched vLLM, used in pip install. + # This value should be updated when cut down release. + 'pip_vllm_ascend_version': "0.10.1rc1", + 'pip_vllm_version': "0.10.1.1", + # CANN image tag + 'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11", + # vllm version in ci + 'ci_vllm_version': 'v0.10.1.1', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +locale_dirs = ['locale/'] +gettext_compact = False +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [ + '_build', + 'Thumbs.db', + '.DS_Store', + '.venv', + 'README.md', + 'user_guide/release.template.md', + # TODO(yikun): Remove this after zh supported + '**/*.zh.md' +] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_title = project +html_theme = 'sphinx_book_theme' +html_logo = 'logos/vllm-ascend-logo-text-light.png' +html_theme_options = { + 'path_to_docs': 'docs/source', + 'repository_url': 'https://github.com/vllm-project/vllm-ascend', + 'use_repository_button': True, + 'use_edit_page_button': True, +} +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+# html_static_path = ['_static'] + +READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') +if READTHEDOCS_VERSION_TYPE == "tag": + # remove the warning banner if the version is a tagged release + header_file = os.path.join(os.path.dirname(__file__), + "_templates/sections/header.html") + # The file might be removed already if the build is triggered multiple times + # (readthedocs build both HTML and PDF versions separately) + if os.path.exists(header_file): + os.remove(header_file) + + +def setup(app): + pass + + +if __name__ == "__main__": + print(json.dumps(myst_substitutions)) diff --git a/docs/source/developer_guide/contribution/index.md b/docs/source/developer_guide/contribution/index.md new file mode 100644 index 0000000..82280ed --- /dev/null +++ b/docs/source/developer_guide/contribution/index.md @@ -0,0 +1,111 @@ +# Contributing + +## Building and testing +It's recommended to set up a local development environment to build and test +before you submit a PR. + +### Setup development environment + +Theoretically, the vllm-ascend build is only supported on Linux because +`vllm-ascend` dependency `torch_npu` only supports Linux. 
+ +But you can still set up dev env on Linux/Windows/macOS for linting and basic +test as following commands: + +#### Run lint locally + +```bash +# Choose a base dir (~/vllm-project/) and set up venv +cd ~/vllm-project/ +python3 -m venv .venv +source ./.venv/bin/activate + +# Clone vllm-ascend and install +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend + +# Install lint requirement and enable pre-commit hook +pip install -r requirements-lint.txt + +# Run lint (You need install pre-commits deps via proxy network at first time) +bash format.sh +``` + +#### Run CI locally + +After complete "Run lint" setup, you can run CI locally: + +```{code-block} bash + :substitutions: + +cd ~/vllm-project/ + +# Run CI need vLLM installed +git clone --branch |vllm_version| https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements/build.txt +VLLM_TARGET_DEVICE="empty" pip install . +cd .. + +# Install requirements +cd vllm-ascend +# For Linux: +pip install -r requirements-dev.txt +# For non Linux: +cat requirements-dev.txt | grep -Ev '^#|^--|^$|^-r' | while read PACKAGE; do pip install "$PACKAGE"; done +cat requirements.txt | grep -Ev '^#|^--|^$|^-r' | while read PACKAGE; do pip install "$PACKAGE"; done + +# Run ci: +bash format.sh ci +``` + +#### Submit the commit + +```bash +# Commit changed files using `-s` +git commit -sm "your commit info" +``` + +🎉 Congratulations! You have completed the development environment setup. + +### Test locally + +You can refer to [Testing](./testing.md) doc to help you setup testing environment and running tests locally. + +## DCO and Signed-off-by + +When contributing changes to this project, you must agree to the DCO. Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO. + +Using `-s` with `git commit` will automatically add this header. + +## PR Title and Classification + +Only specific types of PRs will be reviewed. 
The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:
+
+- `[Attention]` for new features or optimization in attention.
+- `[Communicator]` for new features or optimization in communicators.
+- `[ModelRunner]` for new features or optimization in model runner.
+- `[Platform]` for new features or optimization in platform.
+- `[Worker]` for new features or optimization in worker.
+- `[Core]` for new features or optimization in the core vllm-ascend logic (such as platform, attention, communicators, model runner)
+- `[Kernel]` changes affecting compute kernels and ops.
+- `[Bugfix]` for bug fixes.
+- `[Doc]` for documentation fixes and improvements.
+- `[Test]` for tests (such as unit tests).
+- `[CI]` for build or continuous integration improvements.
+- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly.
+
+:::{note}
+If the PR spans more than one category, please include all relevant prefixes.
+:::
+
+## Others
+
+You may find more information about contributing to the vLLM Ascend backend plugin on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
+If you find any problems when contributing, feel free to submit a PR to improve the doc to help other developers.
+
+:::{toctree}
+:caption: Index
+:maxdepth: 1
+testing
+::: diff --git a/docs/source/developer_guide/contribution/testing.md b/docs/source/developer_guide/contribution/testing.md new file mode 100644 index 0000000..d177308 --- /dev/null +++ b/docs/source/developer_guide/contribution/testing.md @@ -0,0 +1,285 @@ +# Testing
+
+This section explains how to write e2e tests and unit tests to verify the implementation of your feature. 
+ +## Setup test environment + +The fastest way to setup test environment is to use the main branch container image: + +:::::{tab-set} +:sync-group: e2e + +::::{tab-item} Local (CPU) +:selected: +:sync: cpu + +You can run the unit tests on CPU with the following steps: + +```{code-block} bash + :substitutions: + +cd ~/vllm-project/ +# ls +# vllm vllm-ascend + +# Use mirror to speedup download +# docker pull quay.nju.edu.cn/ascend/cann:|cann_image_tag| +export IMAGE=quay.io/ascend/cann:|cann_image_tag| +docker run --rm --name vllm-ascend-ut \ + -v $(pwd):/vllm-project \ + -v ~/.cache:/root/.cache \ + -ti $IMAGE bash + +# (Optional) Configure mirror to speedup download +sed -i 's|ports.ubuntu.com|mirrors.huaweicloud.com|g' /etc/apt/sources.list +pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple/ + +# For torch-npu dev version or x86 machine +export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi" + +apt-get update -y +apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2 + +# Install vllm +cd /vllm-project/vllm +VLLM_TARGET_DEVICE=empty python3 -m pip -v install . + +# Install vllm-ascend +cd /vllm-project/vllm-ascend +# [IMPORTANT] Import LD_LIBRARY_PATH to enumerate the CANN environment under CPU +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -m)-linux/devlib +python3 -m pip install -r requirements-dev.txt +python3 -m pip install -v . 
+``` + +:::: + +::::{tab-item} Single card +:sync: single + +```{code-block} bash + :substitutions: + +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci0 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:main +docker run --rm \ + --name vllm-ascend \ + --device $DEVICE \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /root/.cache:/root/.cache \ + -p 8000:8000 \ + -it $IMAGE bash +``` + +After starting the container, you should install the required packages: + +```bash +# Prepare +pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + +# Install required packages +pip install -r requirements-dev.txt +``` + +:::: + +::::{tab-item} Multi cards +:sync: multi + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:main +docker run --rm \ + --name vllm-ascend \ + --device /dev/davinci0 \ + --device /dev/davinci1 \ + --device /dev/davinci2 \ + --device /dev/davinci3 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /root/.cache:/root/.cache \ + -p 8000:8000 \ + -it $IMAGE bash +``` + +After starting the container, you should install the required packages: + +```bash +cd /vllm-workspace/vllm-ascend/ + +# Prepare +pip config set 
global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + +# Install required packages +pip install -r requirements-dev.txt +``` + +:::: + +::::: + +## Running tests + +### Unit test + +There are several principles to follow when writing unit tests: + +- The test file path should be consistent with source file and start with `test_` prefix, such as: `vllm_ascend/worker/worker_v1.py` --> `tests/ut/worker/test_worker_v1.py` +- The vLLM Ascend test are using unittest framework, see [here](https://docs.python.org/3/library/unittest.html#module-unittest) to understand how to write unit tests. +- All unit tests can be run on CPU, so you must mock the device-related function to host. +- Example: [tests/ut/test_ascend_config.py](https://github.com/vllm-project/vllm-ascend/blob/main/tests/ut/test_ascend_config.py). +- You can run the unit tests using `pytest`: + +:::::{tab-set} +:sync-group: e2e + +::::{tab-item} Local (CPU) +:selected: +:sync: cpu + +```bash +# Run unit tests +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -m)-linux/devlib +TORCH_DEVICE_BACKEND_AUTOLOAD=0 pytest -sv tests/ut +``` + +:::: + +::::{tab-item} Single card +:sync: single + +```bash +cd /vllm-workspace/vllm-ascend/ +# Run all single card the tests +pytest -sv tests/ut + +# Run single test +pytest -sv tests/ut/test_ascend_config.py +``` + +:::: + +::::{tab-item} Multi cards test +:sync: multi + +```bash +cd /vllm-workspace/vllm-ascend/ +# Run all single card the tests +pytest -sv tests/ut + +# Run single test +pytest -sv tests/ut/test_ascend_config.py +``` + +:::: + +::::: + +### E2E test + +Although vllm-ascend CI provide [e2e test](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml) on Ascend CI, you can run it +locally. + +:::::{tab-set} +:sync-group: e2e + +::::{tab-item} Local (CPU) +:sync: cpu + +You can't run e2e test on CPU. 
+:::: + +::::{tab-item} Single card +:selected: +:sync: single + +```bash +cd /vllm-workspace/vllm-ascend/ +# Run all single card the tests +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/ + +# Run a certain test script +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py + +# Run a certain case in test script +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models +``` + +:::: + +::::{tab-item} Multi cards test +:sync: multi + +```bash +cd /vllm-workspace/vllm-ascend/ +# Run all single card the tests +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/ + +# Run a certain test script +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/test_dynamic_npugraph_batchsize.py + +# Run a certain case in test script +VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/test_offline_inference.py::test_models +``` + +:::: + +::::: + +This will reproduce e2e test: [vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml). 
+ +#### E2E test example: + +- Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py) +- Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py) +- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py) +- Reduced Layer model test example: [test_torchair_graph_mode.py - DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48) + + The CI resource is limited, you might need to reduce layer number of the model, below is an example of how to generate a reduced layer model: + 1. Fork the original model repo in modelscope, we need all the files in the repo except for weights. + 2. Set `num_hidden_layers` to the expected number of layers, e.g., `{"num_hidden_layers": 2,}` + 3. Copy the following python script as `generate_random_weight.py`. Set the relevant parameters `MODEL_LOCAL_PATH`, `DIST_DTYPE` and `DIST_MODEL_PATH` as needed: + + ```python + import torch + from transformers import AutoTokenizer, AutoConfig + from modeling_deepseek import DeepseekV3ForCausalLM + from modelscope import snapshot_download + + MODEL_LOCAL_PATH = "~/.cache/modelscope/models/vllm-ascend/DeepSeek-V3-Pruning" + DIST_DTYPE = torch.bfloat16 + DIST_MODEL_PATH = "./random_deepseek_v3_with_2_hidden_layer" + + config = AutoConfig.from_pretrained(MODEL_LOCAL_PATH, trust_remote_code=True) + model = DeepseekV3ForCausalLM(config) + model = model.to(DIST_DTYPE) + model.save_pretrained(DIST_MODEL_PATH) + ``` + +### Run doctest + +vllm-ascend provides a `vllm-ascend/tests/e2e/run_doctests.sh` command to run all doctests in the doc files. 
+The doctest is a good way to make sure the docs are up to date and the examples are executable; you can run it locally as follows:
+
+```bash
+# Run doctest
+/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh
+```
+
+This will reproduce the same environment as the CI: [vllm_ascend_doctest.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml). diff --git a/docs/source/developer_guide/evaluation/accuracy_report/index.md b/docs/source/developer_guide/evaluation/accuracy_report/index.md new file mode 100644 index 0000000..0ed0a18 --- /dev/null +++ b/docs/source/developer_guide/evaluation/accuracy_report/index.md @@ -0,0 +1,6 @@ +# Accuracy Report
+
+:::{toctree}
+:caption: Accuracy Report
+:maxdepth: 1
+::: diff --git a/docs/source/developer_guide/evaluation/index.md b/docs/source/developer_guide/evaluation/index.md new file mode 100644 index 0000000..16a80de --- /dev/null +++ b/docs/source/developer_guide/evaluation/index.md @@ -0,0 +1,10 @@ +# Accuracy
+
+:::{toctree}
+:caption: Accuracy
+:maxdepth: 1
+using_evalscope
+using_lm_eval
+using_opencompass
+accuracy_report/index
+::: diff --git a/docs/source/developer_guide/evaluation/using_evalscope.md b/docs/source/developer_guide/evaluation/using_evalscope.md new file mode 100644 index 0000000..859f1c2 --- /dev/null +++ b/docs/source/developer_guide/evaluation/using_evalscope.md @@ -0,0 +1,175 @@ +# Using EvalScope
+
+This document will guide you through model inference stress testing and accuracy testing using [EvalScope](https://github.com/modelscope/evalscope).
+
+## 1. 
Online serving + +You can run docker container to start the vLLM server on a single NPU: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240 +``` + +If your service start successfully, you can see the info shown below: + +``` +INFO: Started server process [6873] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +Once your server is started, you can query the model with input prompts in new terminal: + +``` +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "prompt": "The future of AI is", + "max_tokens": 7, + "temperature": 0 + }' +``` + +## 2. Install EvalScope using pip + +You can install EvalScope by using: + +```bash +python3 -m venv .venv-evalscope +source .venv-evalscope/bin/activate +pip install gradio plotly evalscope +``` + +## 3. 
Run gsm8k accuracy test using EvalScope + +You can `evalscope eval` run gsm8k accuracy test: + +``` +evalscope eval \ + --model Qwen/Qwen2.5-7B-Instruct \ + --api-url http://localhost:8000/v1 \ + --api-key EMPTY \ + --eval-type service \ + --datasets gsm8k \ + --limit 10 +``` + +After 1-2 mins, the output is as shown below: + +```shell ++---------------------+-----------+-----------------+----------+-------+---------+---------+ +| Model | Dataset | Metric | Subset | Num | Score | Cat.0 | ++=====================+===========+=================+==========+=======+=========+=========+ +| Qwen2.5-7B-Instruct | gsm8k | AverageAccuracy | main | 10 | 0.8 | default | ++---------------------+-----------+-----------------+----------+-------+---------+---------+ +``` + +See more detail in: [EvalScope doc - Model API Service Evaluation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-api-service-evaluation). + +## 4. Run model inference stress testing using EvalScope + +### Install EvalScope[perf] using pip + +```shell +pip install evalscope[perf] -U +``` + +### Basic usage + +You can use `evalscope perf` run perf test: + +``` +evalscope perf \ + --url "http://localhost:8000/v1/chat/completions" \ + --parallel 5 \ + --model Qwen/Qwen2.5-7B-Instruct \ + --number 20 \ + --api openai \ + --dataset openqa \ + --stream +``` + +### Output results + +After 1-2 mins, the output is as shown below: + +```shell +Benchmarking summary: ++-----------------------------------+---------------------------------------------------------------+ +| Key | Value | ++===================================+===============================================================+ +| Time taken for tests (s) | 38.3744 | ++-----------------------------------+---------------------------------------------------------------+ +| Number of concurrency | 5 | ++-----------------------------------+---------------------------------------------------------------+ +| Total requests | 20 | 
++-----------------------------------+---------------------------------------------------------------+ +| Succeed requests | 20 | ++-----------------------------------+---------------------------------------------------------------+ +| Failed requests | 0 | ++-----------------------------------+---------------------------------------------------------------+ +| Output token throughput (tok/s) | 132.6926 | ++-----------------------------------+---------------------------------------------------------------+ +| Total token throughput (tok/s) | 158.8819 | ++-----------------------------------+---------------------------------------------------------------+ +| Request throughput (req/s) | 0.5212 | ++-----------------------------------+---------------------------------------------------------------+ +| Average latency (s) | 8.3612 | ++-----------------------------------+---------------------------------------------------------------+ +| Average time to first token (s) | 0.1035 | ++-----------------------------------+---------------------------------------------------------------+ +| Average time per output token (s) | 0.0329 | ++-----------------------------------+---------------------------------------------------------------+ +| Average input tokens per request | 50.25 | ++-----------------------------------+---------------------------------------------------------------+ +| Average output tokens per request | 254.6 | ++-----------------------------------+---------------------------------------------------------------+ +| Average package latency (s) | 0.0324 | ++-----------------------------------+---------------------------------------------------------------+ +| Average package per request | 254.6 | ++-----------------------------------+---------------------------------------------------------------+ +| Expected number of requests | 20 | ++-----------------------------------+---------------------------------------------------------------+ +| Result DB path | 
outputs/20250423_002442/Qwen2.5-7B-Instruct/benchmark_data.db | ++-----------------------------------+---------------------------------------------------------------+ + +Percentile results: ++------------+----------+---------+-------------+--------------+---------------+----------------------+ +| Percentile | TTFT (s) | ITL (s) | Latency (s) | Input tokens | Output tokens | Throughput(tokens/s) | ++------------+----------+---------+-------------+--------------+---------------+----------------------+ +| 10% | 0.0962 | 0.031 | 4.4571 | 42 | 135 | 29.9767 | +| 25% | 0.0971 | 0.0318 | 6.3509 | 47 | 193 | 30.2157 | +| 50% | 0.0987 | 0.0321 | 9.3387 | 49 | 285 | 30.3969 | +| 66% | 0.1017 | 0.0324 | 9.8519 | 52 | 302 | 30.5182 | +| 75% | 0.107 | 0.0328 | 10.2391 | 55 | 313 | 30.6124 | +| 80% | 0.1221 | 0.0329 | 10.8257 | 58 | 330 | 30.6759 | +| 90% | 0.1245 | 0.0333 | 13.0472 | 62 | 404 | 30.9644 | +| 95% | 0.1247 | 0.0336 | 14.2936 | 66 | 432 | 31.6691 | +| 98% | 0.1247 | 0.0353 | 14.2936 | 66 | 432 | 31.6691 | +| 99% | 0.1247 | 0.0627 | 14.2936 | 66 | 432 | 31.6691 | ++------------+----------+---------+-------------+--------------+---------------+----------------------+ +``` + +See more detail in: [EvalScope doc - Model Inference Stress Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-usage). diff --git a/docs/source/developer_guide/evaluation/using_lm_eval.md b/docs/source/developer_guide/evaluation/using_lm_eval.md new file mode 100644 index 0000000..799eff1 --- /dev/null +++ b/docs/source/developer_guide/evaluation/using_lm_eval.md @@ -0,0 +1,300 @@ +# Using lm-eval +This document will guide you have a accuracy testing using [lm-eval][1]. + +## Online Server +### 1. 
start the vLLM server +You can run docker container to start the vLLM server on a single NPU: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +/bin/bash +vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 4096 & +``` + +Started the vLLM server successfully,if you see log as below: + +``` +INFO: Started server process [9446] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +### 2. Run gsm8k accuracy test using lm-eval + +You can query result with input prompts: + +``` +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "'"<|im_start|>system\nYou are a professional accountant. 
Answer questions using accounting knowledge, output only the option letter (A/B/C/D).<|im_end|>\n"\ +"<|im_start|>user\nQuestion: A company's balance sheet as of December 31, 2023 shows:\n"\ +" Current assets: Cash and equivalents 5 million yuan, Accounts receivable 8 million yuan, Inventory 6 million yuan\n"\ +" Non-current assets: Net fixed assets 12 million yuan\n"\ +" Current liabilities: Short-term loans 4 million yuan, Accounts payable 3 million yuan\n"\ +" Non-current liabilities: Long-term loans 9 million yuan\n"\ +" Owner's equity: Paid-in capital 10 million yuan, Retained earnings ?\n"\ +"Requirement: Calculate the company's Asset-Liability Ratio and Current Ratio (round to two decimal places).\n"\ +"Options:\n"\ +"A. Asset-Liability Ratio=58.33%, Current Ratio=1.90\n"\ +"B. Asset-Liability Ratio=62.50%, Current Ratio=2.17\n"\ +"C. Asset-Liability Ratio=65.22%, Current Ratio=1.75\n"\ +"D. Asset-Liability Ratio=68.00%, Current Ratio=2.50<|im_end|>\n"\ +"<|im_start|>assistant\n"'", + "max_tokens": 1, + "temperature": 0, + "stop": ["<|im_end|>"] + }' | python3 -m json.tool +``` + +The output format matches the following: + +``` +{ + "id": "cmpl-2f678e8bdf5a4b209a3f2c1fa5832e25", + "object": "text_completion", + "created": 1754475138, + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "choices": [ + { + "index": 0, + "text": "A", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "service_tier": null, + "system_fingerprint": null, + "usage": { + "prompt_tokens": 252, + "total_tokens": 253, + "completion_tokens": 1, + "prompt_tokens_details": null + }, + "kv_transfer_params": null +} +``` + +Install lm-eval in the container. 
+
+```bash
+export HF_ENDPOINT="https://hf-mirror.com"
+pip install lm-eval[api]
+```
+
+Run the following command:
+
+```
+# Only test gsm8k dataset in this demo
+lm_eval \
+  --model local-completions \
+  --model_args model=Qwen/Qwen2.5-0.5B-Instruct,base_url=http://127.0.0.1:8000/v1/completions,tokenized_requests=False,trust_remote_code=True \
+  --tasks gsm8k \
+  --output_path ./
+```
+
+After 30 mins, the output is as shown below:
+
+```
+The results in markdown format are as below:
+
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.3215|±  |0.0129|
+|     |       |strict-match    |     5|exact_match|↑  |0.2077|±  |0.0112|
+
+```
+
+## Offline Server
+### 1. Run docker container
+
+You can run a docker container on a single NPU:
+
+```{code-block} bash
+   :substitutions:
+# Update DEVICE according to your device (/dev/davinci[0-7])
+export DEVICE=/dev/davinci7
+# Update the vllm-ascend image
+export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
+docker run --rm \
+--name vllm-ascend \
+--device $DEVICE \
+--device /dev/davinci_manager \
+--device /dev/devmm_svm \
+--device /dev/hisi_hdc \
+-v /usr/local/dcmi:/usr/local/dcmi \
+-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+-v /etc/ascend_install.info:/etc/ascend_install.info \
+-v /root/.cache:/root/.cache \
+-p 8000:8000 \
+-e VLLM_USE_MODELSCOPE=True \
+-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
+-it $IMAGE \
+/bin/bash
+```
+
+### 2. Run gsm8k accuracy test using lm-eval
+Install lm-eval in the container.
+
+```bash
+export HF_ENDPOINT="https://hf-mirror.com"
+pip install lm-eval
+```
+
+Run the following command:
+
+```
+# Only test gsm8k dataset in this demo
+lm_eval \
+  --model vllm \
+  --model_args pretrained=Qwen/Qwen2.5-0.5B-Instruct,max_model_len=4096 \
+  --tasks gsm8k \
+  --batch_size auto
+```
+
+After 1-2 mins, the output is as shown below:
+
+```
+The results in markdown format are as below:
+
+|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
+|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
+|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.3412|±  |0.0131|
+|     |       |strict-match    |     5|exact_match|↑  |0.3139|±  |0.0128|
+
+```
+
+## Use Offline Datasets
+
+Take gsm8k (a single dataset) and mmlu (a multi-subject dataset) as examples; you can see more from [here][2].
+
+```bash
+# set HF_DATASETS_OFFLINE when using offline datasets
+export HF_DATASETS_OFFLINE=1
+git clone https://github.com/EleutherAI/lm-evaluation-harness.git
+cd lm-evaluation-harness
+pip install -e .
+# gsm8k yaml path +cd lm_eval/tasks/gsm8k +# mmlu yaml path +cd lm_eval/tasks/mmlu/default +``` + +set [gsm8k.yaml][3] as follows: + +```yaml +tag: + - math_word_problems +task: gsm8k + +# set dataset_path arrow or json or parquet according to the downloaded dataset +dataset_path: arrow + +# set dataset_name to null +dataset_name: null +output_type: generate_until + +# add dataset_kwargs +dataset_kwargs: + data_files: + # train and test data download path + train: /root/.cache/gsm8k/gsm8k-train.arrow + test: /root/.cache/gsm8k/gsm8k-test.arrow + +training_split: train +fewshot_split: train +test_split: test +doc_to_text: 'Q: {{question}} + A(Please follow the summarize the result at the end with the format of "The answer is xxx", where xx is the result.):' +doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: false + regexes_to_ignore: + - "," + - "\\$" + - "(?s).*#### " + - "\\.$" +generation_kwargs: + until: + - "Question:" + - "" + - "<|im_end|>" + do_sample: false + temperature: 0.0 +repeats: 1 +num_fewshot: 5 +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: "#### (\\-?[0-9\\.\\,]+)" + - function: "take_first" + - name: "flexible-extract" + filter: + - function: "regex" + group_select: -1 + regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)" + - function: "take_first" +metadata: + version: 3.0 +``` + +set [_default_template_yaml][4] as follows: + +```yaml +# set dataset_path according to the downloaded dataset +dataset_path: /root/.cache/mmlu +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +``` + +You can see more usage on [Lm-eval Docs][5]. + +[1]: https://github.com/EleutherAI/lm-evaluation-harness +[2]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#using-local-datasets +[3]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml +[4]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmlu/default/_default_template_yaml +[5]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/README.md diff --git a/docs/source/developer_guide/evaluation/using_opencompass.md b/docs/source/developer_guide/evaluation/using_opencompass.md new file mode 100644 index 0000000..4edc292 --- /dev/null +++ b/docs/source/developer_guide/evaluation/using_opencompass.md @@ -0,0 +1,123 @@ +# Using OpenCompass +This document will guide you have a accuracy testing using [OpenCompass](https://github.com/open-compass/opencompass). + +## 1. 
Online Serving + +You can run docker container to start the vLLM server on a single NPU: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240 +``` + +If your service start successfully, you can see the info shown below: + +``` +INFO: Started server process [6873] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +Once your server is started, you can query the model with input prompts in new terminal: + +``` +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "prompt": "The future of AI is", + "max_tokens": 7, + "temperature": 0 + }' +``` + +## 2. Run ceval accuracy test using OpenCompass +Install OpenCompass and configure the environment variables in the container. 
+ +```bash +# Pin Python 3.10 due to: +# https://github.com/open-compass/opencompass/issues/1976 +conda create -n opencompass python=3.10 +conda activate opencompass +pip install opencompass modelscope[framework] +export DATASET_SOURCE=ModelScope +git clone https://github.com/open-compass/opencompass.git +``` + +Add `opencompass/configs/eval_vllm_ascend_demo.py` with the following content: + +```python +from mmengine.config import read_base +from opencompass.models import OpenAISDK + +with read_base(): + from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets + +# Only test ceval-computer_network dataset in this demo +datasets = ceval_datasets[:1] + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], +) + +models = [ + dict( + abbr='Qwen2.5-7B-Instruct-vLLM-API', + type=OpenAISDK, + key='EMPTY', # API key + openai_api_base='http://127.0.0.1:8000/v1', + path='Qwen/Qwen2.5-7B-Instruct', + tokenizer_path='Qwen/Qwen2.5-7B-Instruct', + rpm_verbose=True, + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + temperature=0.01, + batch_size=8, + retry=3, + ) +] +``` + +Run the following command: + +``` +python3 run.py opencompass/configs/eval_vllm_ascend_demo.py --debug +``` + +After 1-2 mins, the output is as shown below: + +``` +The markdown format results is as below: + +| dataset | version | metric | mode | Qwen2.5-7B-Instruct-vLLM-API | +|----- | ----- | ----- | ----- | -----| +| ceval-computer_network | db9ce2 | accuracy | gen | 68.42 | +``` + +You can see more usage on [OpenCompass Docs](https://opencompass.readthedocs.io/en/latest/index.html). 
diff --git a/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md b/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md new file mode 100644 index 0000000..e1ab3b7 --- /dev/null +++ b/docs/source/developer_guide/feature_guide/ModelRunner_prepare_inputs.md @@ -0,0 +1,237 @@ +# Purpose +What information should we have in order to perform model forward pass? + - the inputs + - the corresponding attention metadata of the inputs + +The following diagram shows what we should prepare for the model inference. + +``` + +---------------+ + inputs --> | | + | model | --> output +attn_meta --> | | + +---------------+ +``` + +Therefore, as long as we have these two pieces of information mentioned above, we can perform the model's forward propagation. + +This article will explain **how we obtain the inputs and their corresponding attention metadata** which are on the left part of above diagram. + +# Overview +## 1. Obtain inputs +The workflow of obtain inputs: +1. Get `token positions`: The relative position of each token within its request sequence. + +2. Get `token indices`: the index of each scheduled token in the token table. + +3. Get `Token IDs`: Using token indices to retrieve the Token IDs from **token id table**. + +At last, these `Token IDs` required to feed into the model, and also, `positions` should be send into model to create `Rope` (Rotary positional embedding). Both of them are the inputs of a model. + +**Note**: because the `Token IDs` is the inputs of the model, so we will call it `Inputs IDs` +## 2. Build inputs attention metadata +The model requires these attention metadata during the forward pass: +- `query start location`: represents the start and end location of each request corresponding to the scheduled tokens. +- `sequence length`: the length of each request including both computed tokens and newly scheduled tokens. +- `number of computed tokens`: the number of computed tokens for each request. 
+- `number of requests`: the number of requests in this batch. +- `number of tokens`: Total number of scheduled tokens in this batch. +- **`block table`**: translates the logical address (within its sequence) of each block to its global physical address in the device's memory. +- `max query len`: the longest scheduled tokens length in this requests batch. +- `slot mapping`: the indices of each token that input token will be stored into. +- `attention mask`: The mask matrix applied to attention scores before softmax to control which tokens can attend to each other. (usually a causal attention) + +# Before start +There are mainly three types of variables. +- token level: represents one attribute corresponding to each scheduled token, so the length of this variable is the number of scheduled tokens +- request level: represents one attribute of each scheduled request, which length usually is the number of scheduled requests. (`query start location` is a special case, which has one more element) +- system level: + 1. **Token IDs table**: store the token ids (i.e. the inputs of the model) of each request. The shape of this table is `(max num request, max model len)`. Here, `max num request` is maximum count of concurrent requests allowed in a forward batch and `max model len` is the max token count can be handled at one request sequence in this model. + 2. **Block table**: translates the logical address (within its sequence) of each block to its global physical address in the device's memory. The shape of this table is `(max num request, max model len / block size)` + +**Note**: How were these two tables formed? +- Both of them are come from the `_update_states` method before **prepare inputs**. You can take a look if you need more inspiration. + +## Tips +What is `Token ID`? +For simple, a `token ID` is an **integer** (usually `int32`), which represents a token. 
+
+example of `Token ID`:
+
+```
+| Token ID     | Token         |
+|--------------|---------------|
+| 0            | [PAD]         |
+| 1            | <|endoftext|> |
+| 2            | <|start|>     |
+| 3            | [SEP]         |
+| 4            | I             |
+| 5            | the           |
+| 6            | be            |
+| 7            | of            |
+| 8            | and           |
+| ...          | ...           |
+| ...          | ...           |
+| vocab_size-1 | <|im_end|>    |
+```
+
+# Go through details
+Let's make a simple example with the following assumptions:
+- max tokens can be scheduled at once: 10.
+- `block size`: 2
+- Totally schedule 3 requests. Their prompt lengths are 3, 2, and 8 respectively.
+- `max model length`: 12 (the max token count that can be handled in one request sequence by this model).
+
+These assumptions are configured at the beginning when starting vLLM. They are not fixed, so you can set them manually.
+## Step 1: All requests in the prefill phase
+
+### Obtain inputs
+Because the max scheduled token count limit is 10, the scheduled token count of each request is: `{'0': 3, '1': 2, '2': 5}`. Note that `request_2` is in chunked prefill and still has 3 prompt tokens not yet scheduled.
+
+#### 1. Get token positions:
+First, find out which request each token belongs to: tokens 0~2 belong to request_0, tokens 3~4 belong to request_1 and tokens 5~9 belong to request_2. So, we can use `request indices` to point out which request each token belongs to. `request indices`: `[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]`
+
+For each request, use **the number of tokens already computed** + **the relative position in current scheduled tokens**: `request_0: [0 + 0, 0 + 1, 0 + 2]`, `request_1: [0 + 0, 0 + 1]`, `request_2: [0 + 0, 0 + 1,..., 0 + 4]` and then concat them together: `[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`. Note: there is a more efficient way (using `request indices`) to create positions in the actual code.
+
+Finally, `token positions` is `[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`. This variable is **token level**.
+
+#### 2. Get token indices:
+Current **Token IDs table**, whose shape is `(max num request, max model len)`.
+ +Why these `T_3_5`, `T_3_6`, `T_3_7` are in this table even them are not scheduled this time? +- We will fill all Token IDs in one request sequence to this table at once, but we only retrieve the tokens we scheduled this time. Then we will retrieve the remain Token IDs next time. + +``` +| T_0_0 | T_0_1 | T_0_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? | +| T_1_0 | T_1_1 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | +| T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? | +| ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | +...... +...... +...... +``` + +Note that the `T_x_x` is an `int32` + +Let's say `M = max model len`, Then we can use `token positions` together with the `request indices` of each token to construct `token indices`. + +So `token indices` = `[0 + 0 * M, 1 + 0 * M, 2 + 0 * M, 0 + 1 * M, 1 + 1 * M, 0 + 2 * M, 1 + 2 * M, 2 + 2 * M, 3 + 2 * M, 4 + 2 * M]` = `[0, 1, 2, 12, 13, 24, 25, 26, 27, 28]` + +#### 3. Retrieve the Token IDs +As mentioned before, we will refer to these `Token IDs` as `Input IDs`. + +We use the `token indices` to select out the corresponding `Input IDs` from the token table, The Pseudocode like: + +``` +input_ids = token_table[token_indices] +``` + +As mentioned before, we will refer these Token IDs as Inputs IDs: +- `Input IDs` = `[T_0_0, T_0_1, T_0_2, T_1_0, T_1_1, T_2_0, T_2_1, T_3_2, T_3_3, T_3_4]` + +### Build inputs attention metadata +Current **Block Table**, we use the first block (i.e. block_0) to mark the unused block. The shape of the block is `(max num request, max model len / block size)`, the `max model len / block size = 12 / 2 = 6` + +``` +| 1 | 2 | 0 | 0 | 0 | 0 | +| 3 | 0 | 0 | 0 | 0 | 0 | +| 4 | 5 | 6 | 0 | 0 | 0 | +| 0 | 0 | 0 | 0 | 0 | 0 | +...... +...... +...... +``` + +The kv cache block in the device memory is like: + +``` +| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ...... 
+``` + +Let's say `K = max model len / block size = 6`, we can get token `device block number` from + +The workflow of achieving slot mapping: +1. get `block table indices` using `K`, `positions` and `request indices`. Purpose: For each token, it could be used to select the `device block number` from `block table`. +2. get `device block number` using `block table indices`. Purpose: `device block number` indicates each token belong to which device block. +3. get `block offsets` using `positions` and `block size`. Purpose: `block offsets` indicates the offsets of each token within a block. +4. construct `slot mapping` using `device block number` and `block offsets`. Purpose: we can use `slot mapping` to store the Token IDs into token slots. + +Details: +1. Using a simple formula to calculate the `block table indices`: `request indices * K + positions / block size`. So it equal to `[0 * 6 + 0 / 2, 0 * 6 + 1 / 2, 0 * 6 + 2 / 2, 1 * 6 + 0 / 2, 1 * 6 + 1 / 2, 2 * 6 + 0 / 2, 2 * 6 + 1 / 2, 2 * 6 + 2 / 2, 2 * 6 + 3 / 2, 2 * 6 + 4 / 2] = [0, 0, 1, 6, 6, 12, 12, 13, 13, 14]`. This could be used to select the `device block number` from `block table`. **token level** +2. Using the `block table indices` to select out the `device block number` for each scheduled token. The Pseudocode like: `block_numbers = block_table[block_table_indices]`. So `device block number = [1, 1, 2, 3, 3, 4, 4, 5, 5, 6]`**token level** +3. `block offsets` could be computed by `block offsets = positions % block size = [0, 1, 0, 0, 1, 0, 1, 0, 1, 0]`. **token level** +4. At last, use `block offsets` and `device block number` to create `slot mapping`: `device block number * block size + block_offsets = [2, 3, 4, 6, 7, 8, 9, 10, 11, 12]` + +First, we know the scheduled token count is `[3, 2, 5]` **request level** + +- So, we can use prefix sum to calculate the `query start location`: `[0, 3, 5, 10]`. 
**request level** +- Because in step_1 all the tokens in prefill, computed tokens count is 0, then `sequence length` = `[3, 2, 5]`. **request level** +- As mentioned above, `number of computed tokens` are all 0: `[0, 0, 0]`. **request level** +- `number of requests`: `3`. +- `number of tokens`: `[3, 2, 5]`. **request level** +- `max query len`: `5`. +- `slot mapping`: `[2, 3, 4, 6, 7, 8, 9, 10, 11, 12]`. **token level** +- `attention mask`: For all request do prefill, we simply create only one mask matrix for reuse across different requests. The shape of this mask matrix is `5 * 5`: + +## Step 2: Chunked prefill +In Step 2, we will no longer provide explanations or perform calculations; instead, we will directly present the final result. + +### Obtain inputs +The scheduled token of each request: `{'0': 1, '1': 1, '2': 3}`. + +1. `request indices`: `[0, 1, 2, 2, 2]` +2. `token positions`: `[3, 2, 5, 6, 7]` + +Current **Token IDs table**: + +``` +| T_0_0 | T_0_1 | T_0_2 | T_0_3 | ? | ? | ? | ? | ? | ? | ? | ? | +| T_1_0 | T_1_1 | T_1_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? | +| T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? | +| ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | +...... +...... +...... +``` + +**Note**: The **T_0_3**, **T_1_2** are new Token IDs of request_0, request_1 respectively. them are sampled from the output of the model. + +3. `token indices`: `[3, 14, 29, 30, 31]` +4. `Input IDs`: `[T_0_3, T_1_2, T_3_5, T_3_6, T_3_7]` + +### Build inputs attention metadata +Current **Block Table**. **Note**: We allocate the `7` and `8` block to `request_1` and `request_2` respectively. Because they need more space in device to store kv cache after generate new tokens or chunked prefill new tokens. + +``` +| 1 | 2 | 0 | 0 | 0 | 0 | +| 3 | 7 | 0 | 0 | 0 | 0 | +| 4 | 5 | 6 | 8 | 0 | 0 | +| 0 | 0 | 0 | 0 | 0 | 0 | +...... +...... +...... 
+```
+
+The kv cache block in the device memory is still like:
+
+```
+| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ......
+```
+
+1. `block table indices`: `[1, 7, 14, 15, 15]`. **token level**
+2. `device block number`: `[2, 7, 6, 8, 8]`. **token level**
+3. `block offsets`: `[1, 0, 1, 0, 1]` **token level**
+4. `slot mapping`: `[5, 14, 13, 16, 17]` **token level**
+
+The scheduled token count is `[1, 1, 3]`
+- `query start location`: `[0, 1, 2, 5]`
+- `sequence length`: `[4, 3, 8]`
+- `number of computed tokens`: `[3, 2, 5]`
+- `number of requests`: `3`
+- `max query len`: `3`
+- `slot mapping`: `[5, 14, 13, 16, 17]`
+- `attention mask`: `5 * 8`. Each token will have a `1 * 8` vector, and there are 5 scheduled tokens.
+
+# At last
+If you understand step_1 and step_2, you will understand all the following steps.
+
+Hopefully this article helps you get a better understanding of how vLLM prepares inputs for model forwarding. If you have any good ideas, you are welcome to contribute.
diff --git a/docs/source/developer_guide/feature_guide/index.md b/docs/source/developer_guide/feature_guide/index.md
new file mode 100644
index 0000000..ce91062
--- /dev/null
+++ b/docs/source/developer_guide/feature_guide/index.md
@@ -0,0 +1,10 @@
+# Feature Guide
+
+This section provides an overview of the features implemented in vLLM Ascend. Developers can refer to this guide to understand how vLLM Ascend works.
+
+:::{toctree}
+:caption: Feature Guide
+:maxdepth: 1
+patch
+ModelRunner_prepare_inputs
+:::
diff --git a/docs/source/developer_guide/feature_guide/patch.md b/docs/source/developer_guide/feature_guide/patch.md
new file mode 100644
index 0000000..19bb288
--- /dev/null
+++ b/docs/source/developer_guide/feature_guide/patch.md
@@ -0,0 +1,85 @@
+# Patch in vLLM Ascend
+
+vLLM Ascend is a platform plugin for vLLM. Because the release cycles of vLLM and vLLM Ascend are different, and because of hardware limitations in some cases, we need to patch some code in vLLM to make it compatible with vLLM Ascend.
+ +In vLLM Ascend code, we provide a patch module `vllm_ascend/patch` to address the change for vLLM. + +## Principle + +We should keep in mind that Patch is not the best way to make vLLM Ascend compatible. It's just a temporary solution. The best way is to contribute the change to vLLM to make it compatible with vLLM Ascend originally. In vLLM Ascend, we have the basic principle for Patch strategy: + +1. Less is more. Please do not patch unless it's the only way currently. +2. Once a patch is added, it's required to describe the future plan for removing the patch. +3. Anytime, clean the patch code is welcome. + +## How it works + +In `vllm_ascend/patch`, you can see the code structure as follows: + +``` +vllm_ascend +├── patch +│ ├── platform +│ │ ├── patch_0_9_2 +│ │ ├── patch_common +│ │ ├── patch_main +│ ├── worker +│ │ ├── patch_0_9_2 +│ │ ├── patch_common +│ │ ├── patch_main +└─────────── +``` + +- **platform**: The patch code in this directory is for patching the code in vLLM main process. It's called by `vllm_ascend/platform::NPUPlatform::pre_register_and_update` very early when vLLM is initialized. + - For online mode, vLLM process calls the platform patch here `vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing the cli args. + - For offline mode, vLLM process calls the platform patch here `vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when parsing the input parameters. +- **worker**: The patch code in this directory is for patching the code in vLLM worker process. It's called by `vllm_ascend/worker/worker_v1::NPUWorker::__init__` when the vLLM worker process is initialized. + - For both online and offline mode, vLLM engine core process calls the worker patch here `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` when initializing the worker process. + +In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM. 
+ +- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0. +- `patch_main`: This module is used for patching the code in vLLM main branch. +- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch. + +## How to write a patch + +Before writing a patch, following the principle above, we should patch the least code. If it's necessary, we can patch the code in either **platform** and **worker** folder. Here is an example to patch `distributed` module in vLLM. + +1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM. +2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`. +3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`. +4. Write your patch code in the new file. Here is an example: + + ```python + import vllm + + def patch_destroy_model_parallel(): + # your patch code + ... + + vllm.distributed.parallel_state.destroy_model_parallel = patch_destroy_model_parallel + ``` + +5. Import the patch file in `__init__.py`. In this example, add `import vllm_ascend.patch.platform.patch_common.patch_distributed` into `vllm_ascend/patch/platform/patch_common/__init__.py`. +6. Add the description of the patch in `vllm_ascend/patch/__init__.py`. The description format is as follows: + + ``` + # ** File: ** + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # 1. `` + # Why: + # + # How: + # + # Related PR (if no, explain why): + # + # Future Plan: + # + ``` + +7. Add the Unit Test and E2E Test. 
Any newly added code in vLLM Ascend should contain the Unit Test and E2E Test as well. You can find more details in [test guide](../contribution/testing.md) + +## Limitation +1. In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore process and Worker process. Now vLLM Ascend only support patch the code in Main process and Worker process by default. If you want to patch the code runs in EngineCore process, you should patch EngineCore process entirely during setup, the entry code is here `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely. +2. If you are running an edited vLLM code, the version of the vLLM may be changed automatically. For example, if you runs an edited vLLM based on v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM Ascend can't distinguish the version of vLLM you're using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you're using, then the patch for v0.10.0 should work. diff --git a/docs/source/developer_guide/modeling/adding_a_new_model.md b/docs/source/developer_guide/modeling/adding_a_new_model.md new file mode 100644 index 0000000..117f559 --- /dev/null +++ b/docs/source/developer_guide/modeling/adding_a_new_model.md @@ -0,0 +1,259 @@ +# Adding a New Model + +This guide demonstrates how to integrate a novel or customized model into vllm-ascend. For foundational concepts, it is highly recommended to refer to +[vllm official doc: Adding a New Model](https://docs.vllm.ai/en/stable/contributing/model/) first. + +## Step 1: Implementing Models with `torch` and `torch_npu` + +This section provides instructions for implementing new models compatible with vllm and vllm-ascend. + +**Before starting:** + +- Verify whether your model already exists in vllm's [models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory. 
+- Use existing models' implementation as templates to accelerate your development. + +### Method 1: Implementing New Models from Scratch + +Follow vllm's [OPT model adaptation](https://docs.vllm.ai/en/stable/contributing/model/basic.html) example for guidance. + +**Key implementation requirements:** + +1. Place model files in `vllm_ascend/models/` directory. + +2. Standard module structure for decoder-only LLMs (please checkout vllm's implementations for other kinds of model): + +- `*ModelForCausalLM` (top-level wrapper) +- `*Model` (main architecture) +- `*DecoderLayer` (transformer block) +- `*Attention` and `*MLP` (specific computation unit) + +:::{note} +`*` denotes your model's unique identifier. +::: + +3. Critical Implementation Details: + +All modules must include a `prefix` argument in `__init__()`. + +**Required interfaces:** + +| Module Type | Required Methods | +| :------------------- | :---------------------------------------- | +| `*ModelForCausalLM` | `get_input_embeddings`, `compute_logits`, `load_weights` | +| `*Model` | `get_input_embeddings`, `load_weights` | + +4. Attention Backend Integration: + +Importing attention via `from vllm.attention import Attention` can automatically leverage the attention backend routing of vllm-ascend (see: `get_attn_backend_cls()` in `vllm_ascend/platform.py`). + +5. Tensor Parallelism: + +Use vllm's parallel layers (`ColumnParallelLinear`, `VocabParallelEmbedding`, etc.) to implement models supporting tensor parallelism. Note that Ascend-specific customizations are implemented in `vllm_ascend/ops/` directory (RMSNorm, VocabParallelEmbedding, etc.). 
+ +**Reference Implementation Template** (assumed path: `vllm_ascend/models/custom_model.py`): + +```python +from collections.abc import Iterable +from typing import Optional, Union + +import torch +from torch import nn +from vllm.attention import Attention +from vllm.config import VllmConfig +from vllm.sequence import IntermediateTensors +from vllm.model_executor.sampling_metadata import SamplingMetadata + +class CustomAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # Implement attention logic + ... + +class CustomDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = CustomAttention(vllm_config, prefix=f"{prefix}.self_attn") + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # Implement decoder layer + ... + +class CustomModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList([ + CustomDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") + for i in range(vllm_config.model_config.hf_config.num_hidden_layers) + ]) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + ... + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + ... + + def load_weights(self, + weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + ... + +class CustomModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = CustomModel(vllm_config, prefix=f"{prefix}.model") + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + ... 
+ + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + ... + + def compute_logits(self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + ... + + def load_weights(self, + weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + ... +``` + +### Method 2: Customizing Existing vLLM Models + +For most use cases, extending existing implementations is preferable. We demonstrate an example to inherit from base classes and implement a custom deepseek model below (assumed path: `vllm_ascend/models/deepseek_v2.py`). + +```python +from typing import List, Optional, Union +import torch +from vllm.attention import AttentionMetadata +from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM +from vllm.sequence import IntermediateTensors + +class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM): + # Define merged weights for quantization/efficiency + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + } + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + # Custom forward logic + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds + ) + return hidden_states +``` + +:::{note} +For a complete implementation reference, see: `vllm_ascend/models/deepseek_v2.py`.
::: + +## Step 2: Registering Custom Models using ModelRegistry Plugins in vLLM + +vllm provides a plugin mechanism for registering externally implemented models without modifying its codebase. + +To integrate your implemented model from `vllm_ascend/models/` directory: + +1. Import your model implementation in `vllm_ascend/models/__init__.py` using relative imports. +2. Register the model wrapper class via `vllm.ModelRegistry.register_model()` function. + +**Reference Registration Template** (an example of registering new models in `vllm_ascend/models/__init__.py`): + +```python +from vllm import ModelRegistry + +def register_model(): + from .custom_model import CustomModelForCausalLM # New custom model + from .deepseek_v2 import CustomDeepseekV2ForCausalLM # Customized Deepseek + + # For NEW architectures: Register with unique name + ModelRegistry.register_model( + "CustomModelForCausalLM", # Must match config.json's 'architectures' + "vllm_ascend.models.custom_model:CustomModelForCausalLM" + ) + + # For MODIFIED architectures: Use original name + ModelRegistry.register_model( + "DeepseekV2ForCausalLM", # Original architecture identifier in vLLM + "vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM" + ) +``` + +:::{note} +The first argument of `vllm.ModelRegistry.register_model()` indicates the unique architecture identifier which must match `architectures` in `config.json` of the model. + +```json +{ + "architectures": [ + "CustomModelForCausalLM" + ] +} +``` + +::: + +## Step 3: Verification + +### Case 1: Overriding Existing vLLM Model Architecture + +If you're registering a customized model architecture based on vllm's existing implementation (overriding vllm's original class), when executing vllm offline/online inference (using any model), you'll observe warning logs similar to the following output from `vllm/model_executor/models/registry.py`.
+ +```bash +Model architecture DeepseekV2ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend/models/deepseek_v2:CustomDeepseekV2ForCausalLM. +``` + +### Case 2: Registering New Model Architecture + +If you're registering a novel model architecture not present in vllm (creating a completely new class), current logs won't provide explicit confirmation by default. It's recommended to add the following logging statement at the end of the `register_model` method in `vllm/model_executor/models/registry.py`. + +```python +logger.info(f"model_arch: {model_arch} has been registered here!") +``` + +After adding this line, you will see confirmation logs shown below when running vllm offline/online inference (using any model). + +```bash +model_arch: CustomModelForCausalLM has been registered here! +``` + +This log output confirms your novel model architecture has been successfully registered in vllm. + +## Step 4: Testing + +After adding a new model, we should do a basic functional test (offline/online inference), an accuracy test and a performance benchmark for the model. + +Find more details at: + +- [Accuracy test guide](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html) +- [Performance benchmark guide](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html) + +## Step 5: Updating Supported Models Doc + +Finally, if all the steps above are completed, you should add the new model into our [Supported Models](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html) doc.
+ diff --git a/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md b/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md new file mode 100644 index 0000000..03fadf3 --- /dev/null +++ b/docs/source/developer_guide/modeling/adding_a_new_multimodal_model.md @@ -0,0 +1,3 @@ +# Adding a New Multi-Modal Model + +**_Coming soon ..._** diff --git a/docs/source/developer_guide/modeling/index.md b/docs/source/developer_guide/modeling/index.md new file mode 100644 index 0000000..96eeb50 --- /dev/null +++ b/docs/source/developer_guide/modeling/index.md @@ -0,0 +1,10 @@ +# Modeling + +This section provides tutorials on how to implement and register a new model into vllm-ascend. + +:::{toctree} +:caption: Modeling +:maxdepth: 1 +adding_a_new_model +adding_a_new_multimodal_model +::: diff --git a/docs/source/developer_guide/performance/index.md b/docs/source/developer_guide/performance/index.md new file mode 100644 index 0000000..0fa1466 --- /dev/null +++ b/docs/source/developer_guide/performance/index.md @@ -0,0 +1,9 @@ +# Performance + +:::{toctree} +:caption: Performance +:maxdepth: 1 +performance_benchmark +profile_execute_duration +optimization_and_tuning +::: diff --git a/docs/source/developer_guide/performance/optimization_and_tuning.md b/docs/source/developer_guide/performance/optimization_and_tuning.md new file mode 100644 index 0000000..61e761a --- /dev/null +++ b/docs/source/developer_guide/performance/optimization_and_tuning.md @@ -0,0 +1,183 @@ +# Optimization and Tuning + +This guide aims to help users improve vllm-ascend performance at the system level. It includes OS configuration, library optimization, deployment guide and so on. Any feedback is welcome.
+ +## Preparation + +Run the container: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci0 +# Update the cann base image +export IMAGE=m.daocloud.io/quay.io/ascend/cann:|cann_image_tag| +docker run --rm \ +--name performance-test \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-it $IMAGE bash +``` + +Configure your environment: + +```{code-block} bash + :substitutions: +# Configure the mirror +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \ +echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> 
/etc/apt/sources.list + +# Install OS packages +apt update && apt install wget gcc g++ libnuma-dev git vim -y +``` + +Install vllm and vllm-ascend: + +```{code-block} bash + :substitutions: +# Install necessary dependencies +pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple +pip install modelscope pandas datasets gevent sacrebleu rouge_score pybind11 pytest + +# Configure this var to speed up model download +export VLLM_USE_MODELSCOPE=true +``` + +Please follow the [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) to make sure vllm, vllm-ascend and mindie-turbo are installed correctly. + +:::{note} +Make sure your vllm and vllm-ascend are installed after your python configuration is completed, because these packages will build binary files using the python in the current environment. If you install vllm, vllm-ascend and mindie-turbo before chapter 1.1, the binary files will not use the optimized python. +::: + +## Optimizations + +### 1. Compilation Optimization + +#### 1.1. Install optimized `python` + +Python supports **LTO** and **PGO** optimization starting from version `3.6`, which can be enabled at compile time. We offer compilation-optimized `python` packages directly to users for the sake of convenience. You can also reproduce the `python` build following this [tutorial](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html) according to your specific scenarios.
+ +```{code-block} bash + :substitutions: +mkdir -p /workspace/tmp +cd /workspace/tmp + +# Download prebuilt lib and packages +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libcrypto.so.1.1 +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libomp.so +wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libssl.so.1.1 +wget https://repo.oepkgs.net/ascend/pytorch/vllm/python/py311_bisheng.tar.gz + +# Configure python and pip +cp ./*.so* /usr/local/lib +tar -zxvf ./py311_bisheng.* -C /usr/local/ +mv /usr/local/py311_bisheng/ /usr/local/python +sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3 +sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3.11 +ln -sf /usr/local/python/bin/python3 /usr/bin/python +ln -sf /usr/local/python/bin/python3 /usr/bin/python3 +ln -sf /usr/local/python/bin/python3.11 /usr/bin/python3.11 +ln -sf /usr/local/python/bin/pip3 /usr/bin/pip3 +ln -sf /usr/local/python/bin/pip3 /usr/bin/pip + +export PATH=/usr/bin:/usr/local/python/bin:$PATH +``` + +### 2. OS Optimization + +#### 2.1. jemalloc + +**jemalloc** is a memory allocator that improves performance in multi-threaded scenarios and can reduce memory fragmentation. jemalloc uses a thread-local memory manager to allocate variables, which avoids lock contention between threads and can greatly improve performance. + +```{code-block} bash + :substitutions: +# Install jemalloc +sudo apt update +sudo apt install libjemalloc2 + +# Configure jemalloc +export LD_PRELOAD="/usr/lib/$(uname -i)-linux-gnu/libjemalloc.so.2 $LD_PRELOAD" +``` + +#### 2.2. Tcmalloc + +**Tcmalloc (Thread Counting Malloc)** is a universal memory allocator that improves overall performance while ensuring low latency by introducing a multi-level cache structure, reducing mutex competition and optimizing large object processing flow. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/700/ptmoddevg/trainingmigrguide/performance_tuning_0068.html).
+ +```{code-block} bash + :substitutions: +# Install tcmalloc +sudo apt update +sudo apt install libgoogle-perftools4 libgoogle-perftools-dev + +# Get the location of libtcmalloc.so* +find /usr -name libtcmalloc.so* + +# Make the priority of tcmalloc higher +# The is the location of libtcmalloc.so we get from the upper command +# Example: "$LD_PRELOAD:/usr/lib/aarch64-linux-gnu/libtcmalloc.so" +export LD_PRELOAD="$LD_PRELOAD:" + +# Verify your configuration +# The path of libtcmalloc.so will be contained in the result if your configuration is valid +ldd `which python` +``` + +### 3. `torch_npu` Optimization + +Some performance tuning features in `torch_npu` are controlled by environment variables. Some features and their related environment variables are shown below. + +Memory optimization: + +```{code-block} bash + :substitutions: +# Upper limit of memory block splitting allowed (MB), Setting this parameter can prevent large memory blocks from being split. +export PYTORCH_NPU_ALLOC_CONF="max_split_size_mb:250" + +# When operators on the communication stream have dependencies, they all need to be ended before being released for reuse. The logic of multi-stream reuse is to release the memory on the communication stream in advance so that the computing stream can be reused. +export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" +``` + +Schedule optimization: + +```{code-block} bash + :substitutions: +# Optimize operator delivery queue, this will affect the memory peak value, and may degrade if the memory is tight. +export TASK_QUEUE_ENABLE=2 + +# This will greatly improve the CPU bottleneck model and ensure the same performance for the NPU bottleneck model. +export CPU_AFFINITY_CONF=1 +``` + +### 4. CANN Optimization + +#### 4.1. HCCL Optimization + +There are some performance tuning features in HCCL, which are controlled by environment variables. + +You can configure HCCL to use "AIV" mode to optimize performance by setting the environment variable shown below. 
In "AIV" mode, the communication is scheduled by AI vector core directly with ROCE, instead of being scheduled by AI cpu. + +```{code-block} bash + :substitutions: +export HCCL_OP_EXPANSION_MODE="AIV" +``` + +Plus, there are more features for performance optimization in specific scenarios, which are shown below. + +- `HCCL_INTRA_ROCE_ENABLE`: Use RDMA link instead of SDMA link between two 8Ps as the mesh interconnect link, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0044.html). +- `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html). +- `HCCL_RDMA_SL`: Use this var to configure service level of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html). +- `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html). diff --git a/docs/source/developer_guide/performance/performance_benchmark.md b/docs/source/developer_guide/performance/performance_benchmark.md new file mode 100644 index 0000000..e08d769 --- /dev/null +++ b/docs/source/developer_guide/performance/performance_benchmark.md @@ -0,0 +1,194 @@ +# Performance Benchmark +This document details the benchmark methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. To maintain alignment with vLLM, we use the [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) script provided by the vllm project. 
+ +**Benchmark Coverage**: We measure offline e2e latency and throughput, and fixed-QPS online serving benchmarks, for more details see [vllm-ascend benchmark scripts](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks). + +## 1. Run docker container + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +/bin/bash +``` + +## 2. Install dependencies + +```bash +cd /workspace/vllm-ascend +pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +pip install -r benchmarks/requirements-bench.txt +``` + +## 3. (Optional)Prepare model weights +For faster running speed, we recommend downloading the model in advance: + +```bash +modelscope download --model LLM-Research/Meta-Llama-3.1-8B-Instruct +``` + +You can also replace all model paths in the [json](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests) files with your local paths: + +```bash +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "your local model path", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] +``` + +## 4. 
Run benchmark script +Run benchmark script: + +```bash +bash benchmarks/scripts/run-performance-benchmarks.sh +``` + +After about 10 mins, the output is as shown below: + +```bash +online serving: +qps 1: +============ Serving Benchmark Result ============ +Successful requests: 200 +Benchmark duration (s): 212.77 +Total input tokens: 42659 +Total generated tokens: 43545 +Request throughput (req/s): 0.94 +Output token throughput (tok/s): 204.66 +Total Token throughput (tok/s): 405.16 +---------------Time to First Token---------------- +Mean TTFT (ms): 104.14 +Median TTFT (ms): 102.22 +P99 TTFT (ms): 153.82 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 38.78 +Median TPOT (ms): 38.70 +P99 TPOT (ms): 48.03 +---------------Inter-token Latency---------------- +Mean ITL (ms): 38.46 +Median ITL (ms): 36.96 +P99 ITL (ms): 75.03 +================================================== + +qps 4: +============ Serving Benchmark Result ============ +Successful requests: 200 +Benchmark duration (s): 72.55 +Total input tokens: 42659 +Total generated tokens: 43545 +Request throughput (req/s): 2.76 +Output token throughput (tok/s): 600.24 +Total Token throughput (tok/s): 1188.27 +---------------Time to First Token---------------- +Mean TTFT (ms): 115.62 +Median TTFT (ms): 109.39 +P99 TTFT (ms): 169.03 +-----Time per Output Token (excl. 
1st token)------ +Mean TPOT (ms): 51.48 +Median TPOT (ms): 52.40 +P99 TPOT (ms): 69.41 +---------------Inter-token Latency---------------- +Mean ITL (ms): 50.47 +Median ITL (ms): 43.95 +P99 ITL (ms): 130.29 +================================================== + +qps 16: +============ Serving Benchmark Result ============ +Successful requests: 200 +Benchmark duration (s): 47.82 +Total input tokens: 42659 +Total generated tokens: 43545 +Request throughput (req/s): 4.18 +Output token throughput (tok/s): 910.62 +Total Token throughput (tok/s): 1802.70 +---------------Time to First Token---------------- +Mean TTFT (ms): 128.50 +Median TTFT (ms): 128.36 +P99 TTFT (ms): 187.87 +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): 83.60 +Median TPOT (ms): 77.85 +P99 TPOT (ms): 165.90 +---------------Inter-token Latency---------------- +Mean ITL (ms): 65.72 +Median ITL (ms): 54.84 +P99 ITL (ms): 289.63 +================================================== + +qps inf: +============ Serving Benchmark Result ============ +Successful requests: 200 +Benchmark duration (s): 41.26 +Total input tokens: 42659 +Total generated tokens: 43545 +Request throughput (req/s): 4.85 +Output token throughput (tok/s): 1055.44 +Total Token throughput (tok/s): 2089.40 +---------------Time to First Token---------------- +Mean TTFT (ms): 3394.37 +Median TTFT (ms): 3359.93 +P99 TTFT (ms): 3540.93 +-----Time per Output Token (excl. 
1st token)------ +Mean TPOT (ms): 66.28 +Median TPOT (ms): 64.19 +P99 TPOT (ms): 97.66 +---------------Inter-token Latency---------------- +Mean ITL (ms): 56.62 +Median ITL (ms): 55.69 +P99 ITL (ms): 82.90 +================================================== + +offline: +latency: +Avg latency: 4.944929537673791 seconds +10% percentile latency: 4.894104263186454 seconds +25% percentile latency: 4.909652255475521 seconds +50% percentile latency: 4.932477846741676 seconds +75% percentile latency: 4.9608619548380375 seconds +90% percentile latency: 5.035418218374252 seconds +99% percentile latency: 5.052476694583893 seconds + +throughput: +Throughput: 4.64 requests/s, 2000.51 total tokens/s, 1010.54 output tokens/s +Total num prompt tokens: 42659 +Total num output tokens: 43545 +``` + +The result json files are generated into the path `benchmark/results` +These files contain detailed benchmarking results for further analysis. + +```bash +. +|-- latency_llama8B_tp1.json +|-- serving_llama8B_tp1_qps_1.json +|-- serving_llama8B_tp1_qps_16.json +|-- serving_llama8B_tp1_qps_4.json +|-- serving_llama8B_tp1_qps_inf.json +`-- throughput_llama8B_tp1.json +``` diff --git a/docs/source/developer_guide/performance/profile_execute_duration.md b/docs/source/developer_guide/performance/profile_execute_duration.md new file mode 100644 index 0000000..5c43017 --- /dev/null +++ b/docs/source/developer_guide/performance/profile_execute_duration.md @@ -0,0 +1,40 @@ +# Profile Execute Duration + +The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically, this is done by using `torch.npu.synchronize()` and obtaining CPU timestamps, which increases the performance overhead of host/device synchronization. 
+ +**To reduce the performance overhead, we add this feature, using the NPU event timestamp mechanism to observe the device execution time asynchronously.** + +## Usage +* Use the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to enable this feature. +* Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration. +* Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages. + +**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. Execute the script as follows:** + +``` +VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py +``` + +## Example Output + +``` +5691:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.17ms [prepare input and forward]:9.57ms [forward]:4.14ms +5695:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.29ms [prepare input and forward]:10.19ms [forward]:4.14ms +5697:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.81ms [prepare input and forward]:10.29ms [forward]:3.99ms +5701:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.10ms [prepare input and forward]:10.62ms [forward]:4.33ms +5705:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.65ms [prepare input and forward]:9.58ms [forward]:4.20ms +5709:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.43ms [prepare input and forward]:9.88ms [forward]:4.20ms +5711:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.89ms [prepare input and forward]:10.49ms [forward]:4.19ms +5715:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post 
process]:14.14ms [prepare input and forward]:11.21ms [forward]:4.18ms +5719:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.71ms [prepare input and forward]:10.15ms [forward]:4.42ms +5723:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.31ms [forward]:4.25ms +5725:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.12ms [prepare input and forward]:10.33ms [forward]:4.24ms +5729:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.58ms [prepare input and forward]:10.85ms [forward]:4.32ms +5733:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.32ms [prepare input and forward]:9.79ms [forward]:4.28ms +5737:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:15.06ms [prepare input and forward]:9.89ms [forward]:4.32ms +5739:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.48ms [forward]:4.27ms +5743:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.60ms [prepare input and forward]:10.71ms [forward]:4.61ms +5747:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.21ms [prepare input and forward]:10.10ms [forward]:4.52ms +5751:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:15.03ms [prepare input and forward]:10.00ms [forward]:4.42ms + +``` diff --git a/docs/source/faqs.md b/docs/source/faqs.md new file mode 100644 index 0000000..c0a3f0d --- /dev/null +++ b/docs/source/faqs.md @@ -0,0 +1,198 @@ +# FAQs + +## Version Specific FAQs + +- [[v0.9.1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2643) +- [[v0.10.1rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2630) + +## General FAQs + +### 1. What devices are currently supported? 
+ +Currently, **ONLY** Atlas A2 series (Ascend-cann-kernels-910b), Atlas A3 series (Atlas-A3-cann-kernels) and Atlas 300I (Ascend-cann-kernels-310p) series are supported: + +- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2) +- Atlas 800I A2 Inference series (Atlas 800I A2) +- Atlas A3 Training series (Atlas 800T A3, Atlas 900 A3 SuperPoD, Atlas 9000 A3 SuperPoD) +- Atlas 800I A3 Inference series (Atlas 800I A3) +- [Experimental] Atlas 300I Inference series (Atlas 300I Duo) + +The series below are NOT supported yet: +- Atlas 200I A2 (Ascend-cann-kernels-310b) unplanned yet +- Ascend 910, Ascend 910 Pro B (Ascend-cann-kernels-910) unplanned yet + +From a technical view, vllm-ascend support would be possible if torch-npu is supported. Otherwise, we have to implement it by using custom ops. You are also welcome to join us to improve it together. + +### 2. How to get our docker containers? + +You can get our containers at `Quay.io`, e.g., [vllm-ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and [cann](https://quay.io/repository/ascend/cann?tab=tags).
+ +If you are in China, you can use `daocloud` to accelerate your downloading: + +```bash +# Replace with tag you want to pull +TAG=v0.7.3rc2 +docker pull m.daocloud.io/quay.io/ascend/vllm-ascend:$TAG +``` + +#### Load Docker Images for offline environment +If you want to use the container image in offline environments (no internet connection), you need to download the container image in an environment with internet access: + +**Exporting Docker images:** + +```{code-block} bash + :substitutions: +# Pull the image on a machine with internet access +TAG=|vllm_ascend_version| +docker pull quay.io/ascend/vllm-ascend:$TAG + +# Export the image to a tar file and compress to tar.gz +docker save quay.io/ascend/vllm-ascend:$TAG | gzip > vllm-ascend-$TAG.tar.gz +``` + +**Importing Docker images in environment without internet access:** + +```{code-block} bash + :substitutions: +# Transfer the tar/tar.gz file to the offline environment and load it +TAG=|vllm_ascend_version| +docker load -i vllm-ascend-$TAG.tar.gz + +# Verify the image is loaded +docker images | grep vllm-ascend +``` + +### 3. What models does vllm-ascend support? + +Find more details [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html). + +### 4. How to get in touch with our community? + +There are many channels through which you can communicate with our community developers / users: + +- Submit a GitHub [issue](https://github.com/vllm-project/vllm-ascend/issues?page=1). +- Join our [weekly meeting](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z) and share your ideas. +- Join our [WeChat](https://github.com/vllm-project/vllm-ascend/issues/227) group and ask your questions. +- Join our ascend channel in [vLLM forums](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support/6) and publish your topics. + +### 5. What features does vllm-ascend V1 support?
+
+Find more details [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html).
+
+### 6. How to solve the problem of "Failed to infer device type" or "libatb.so: cannot open shared object file"?
+
+Basically, the reason is that the NPU environment is not configured correctly. You can:
+1. try `source /usr/local/Ascend/nnal/atb/set_env.sh` to enable NNAL package.
+2. try `source /usr/local/Ascend/ascend-toolkit/set_env.sh` to enable CANN package.
+3. try `npu-smi info` to check whether the NPU is working.
+
+If all the above steps do not work, you can try the following code with python to check whether there is any error:
+
+```
+import torch
+import torch_npu
+import vllm
+```
+
+If all the above steps do not work, feel free to submit a GitHub issue.
+
+### 7. How does vllm-ascend perform?
+
+Currently, only some models are improved, such as `Qwen2.5 VL`, `Qwen3` and `Deepseek V3`; others are not good enough. From 0.9.0rc2, Qwen and Deepseek work with graph mode to achieve good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well.
+
+### 8. How does vllm-ascend work with vllm?
+vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit.
+
+### 9. Does vllm-ascend support Prefill Disaggregation feature?
+
+Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND support, we will make it stable and supported by vllm-ascend in the future.
+
+### 10. Does vllm-ascend support quantization method?
+
+Currently, w8a8 quantization is already supported by vllm-ascend originally on v0.8.4rc2 or higher. If you're using vllm 0.7.3 version, w8a8 quantization is supported with the integration of vllm-ascend and mindie-turbo, please use `pip install vllm-ascend[mindie-turbo]`.
+
+### 11. How to run w8a8 DeepSeek model?
+
+Please follow the [inference tutorial](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html) and replace the model with DeepSeek.
+
+### 12. There is no output in log when loading models using vllm-ascend, How to solve it?
+
+If you're using vllm 0.7.3 version, this is a known progress bar display issue in VLLM, which has been resolved in [this PR](https://github.com/vllm-project/vllm/pull/12428), please cherry-pick it locally by yourself. Otherwise, please file an issue.
+
+### 13. How vllm-ascend is tested
+
+vllm-ascend is tested by functional test, performance test and accuracy test.
+
+- **Functional test**: we added CI, which includes a portion of vllm's native unit tests and vllm-ascend's own unit tests; in vllm-ascend's tests, we test basic functionality, popular models availability and [supported features](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html) via e2e test
+
+- **Performance test**: we provide [benchmark](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks) tools for end-to-end performance benchmark which can easily be re-run locally, we'll publish a perf website to show the performance test results for each pull request
+
+- **Accuracy test**: we're working on adding accuracy test to CI as well.
+
+Finally, for each release, we'll publish the performance test and accuracy test report in the future.
+
+### 14. How to fix the error "InvalidVersion" when using vllm-ascend?
+It's usually because you have installed a dev/editable version of vLLM package.
In this case, we provide the env variable `VLLM_VERSION` to let users specify the version of vLLM package to use. Please set the env variable `VLLM_VERSION` to the version of vLLM package you have installed. The format of `VLLM_VERSION` should be `X.Y.Z`. + +### 15. How to handle Out Of Memory? +OOM errors typically occur when the model exceeds the memory capacity of a single NPU. For general guidance, you can refer to [vLLM's OOM troubleshooting documentation](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-of-memory). + +In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynamic memory allocation/deallocation during inference can exacerbate memory fragmentation, leading to OOM. To address this: + +- **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig). + +- **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html). + +### 16. Failed to enable NPU graph mode when running DeepSeek? +You may encounter the following error if running DeepSeek with NPU graph mode enabled. The allowed number of queries per kv when enabling both MLA and Graph mode only support {32, 64, 128}, **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode support on DeepSeek-V2-Lite will be done in the future. 
+
+And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure that, after the tensor parallel split, num_heads / num_kv_heads is in {32, 64, 128}.
+
+```bash
+[rank0]: RuntimeError: EZ9999: Inner Error!
+[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218]
+```
+
+### 17. Failed to reinstall vllm-ascend from source after uninstalling vllm-ascend?
+You may encounter the problem of C compilation failure when reinstalling vllm-ascend from source using pip. If the installation fails, it is recommended to use `python setup.py install` to install, or use `python setup.py clean` to clear the cache.
+
+### 18. How to generate deterministic results when using vllm-ascend?
+There are several factors that affect output certainty:
+
+1. Sampler Method: using **Greedy sample** by setting `temperature=0` in `SamplingParams`, e.g.:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0)
+# Create an LLM.
+llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+
+# Generate texts from the prompts.
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+2. Set the following environment variables:
+
+```bash
+export LCCL_DETERMINISTIC=1
+export HCCL_DETERMINISTIC=true
+export ATB_MATMUL_SHUFFLE_K_ENABLE=0
+export ATB_LLM_LCOC_ENABLE=0
+```
+
+### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model?
+The `Qwen2.5-Omni` model requires the `librosa` package to be installed, you need to install the `qwen-omni-utils` package to ensure all dependencies are met `pip install qwen-omni-utils`, +this package will install `librosa` and its related dependencies, resolving the `ImportError: No module named 'librosa'` issue and ensuring audio processing functionality works correctly. diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..fafeb40 --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,71 @@ +# Welcome to vLLM Ascend Plugin + +:::{figure} ./logos/vllm-ascend-logo-text-light.png +:align: center +:alt: vLLM +:class: no-scaled-link +:width: 70% +::: + +:::{raw} html +

+vLLM Ascend Plugin + +

+ +

+ +Star +Watch +Fork +

+::: + +vLLM Ascend plugin (vllm-ascend) is a community maintained hardware plugin for running vLLM on the Ascend NPU. + +This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM. + +By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU. + +## Documentation + +% How to start using vLLM on Ascend NPU? +:::{toctree} +:caption: Getting Started +:maxdepth: 1 +quick_start +installation +tutorials/index.md +faqs +::: + +% What does vLLM Ascend Plugin support? +:::{toctree} +:caption: User Guide +:maxdepth: 1 +user_guide/support_matrix/index +user_guide/configuration/index +user_guide/feature_guide/index +user_guide/release_notes +::: + +% How to contribute to the vLLM Ascend project +:::{toctree} +:caption: Developer Guide +:maxdepth: 1 +developer_guide/contribution/index +developer_guide/feature_guide/index +developer_guide/evaluation/index +developer_guide/performance/index +developer_guide/modeling/index +::: + +% How to involve vLLM Ascend +:::{toctree} +:caption: Community +:maxdepth: 1 +community/governance +community/contributors +community/versioning_policy +community/user_stories/index +::: diff --git a/docs/source/installation.md b/docs/source/installation.md new file mode 100644 index 0000000..b06777e --- /dev/null +++ b/docs/source/installation.md @@ -0,0 +1,282 @@ +# Installation + +This document describes how to install vllm-ascend manually. + +## Requirements + +- OS: Linux +- Python: >= 3.9, < 3.12 +- A hardware with Ascend NPU. It's usually the Atlas 800 A2 series. 
+- Software:
+
+    | Software      | Supported version                | Note                                      |
+    |---------------|----------------------------------|-------------------------------------------|
+    | CANN          | >= 8.2.RC1                       | Required for vllm-ascend and torch-npu    |
+    | torch-npu     | >= 2.7.1.dev20250724             | Required for vllm-ascend, no need to install manually, it will be auto installed in below steps |
+    | torch         | >= 2.7.1                         | Required for torch-npu and vllm           |
+
+You have two ways to install:
+- **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
+- **Using docker**: use the `vllm-ascend` pre-built docker image directly.
+
+## Configure a new environment
+
+Before installing, you need to make sure firmware/driver and CANN are installed correctly, refer to [link](https://ascend.github.io/docs/sources/ascend/quick_install.html) for more details.
+
+### Configure hardware environment
+
+To verify that the Ascend NPU firmware and driver were correctly installed, run:
+
+```bash
+npu-smi info
+```
+
+Refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) for more details.
+ +### Configure software environment + +:::::{tab-set} +:sync-group: install + +::::{tab-item} Before using pip +:selected: +:sync: pip + +The easiest way to prepare your software environment is using CANN image directly: + +```{code-block} bash + :substitutions: +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/cann:|cann_image_tag| +docker run --rm \ + --name vllm-ascend-env \ + --device $DEVICE \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /root/.cache:/root/.cache \ + -it $IMAGE bash +``` + +:::{dropdown} Click here to see "Install CANN manually" +:animate: fade-in-slide-down +You can also install CANN manually: + +```bash +# Create a virtual environment +python -m venv vllm-ascend-env +source vllm-ascend-env/bin/activate + +# Install required python packages. +pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions + +# Download and install the CANN package. 
+wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run +./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run --full +# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-aarch64.run + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run +./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run --install + +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run +./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run --install + +source /usr/local/Ascend/nnal/atb/set_env.sh +``` + +::: + +:::: + +::::{tab-item} Before using docker +:sync: docker +No more extra step if you are using `vllm-ascend` prebuilt docker image. +:::: +::::: + +Once it's done, you can start to set up `vllm` and `vllm-ascend`. 
+ +## Setup vllm and vllm-ascend + +:::::{tab-set} +:sync-group: install + +::::{tab-item} Using pip +:selected: +:sync: pip + +First install system dependencies and config pip mirror: + +```bash +# Using apt-get with mirror +sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list +apt-get update -y && apt-get install -y gcc g++ cmake libnuma-dev wget git curl jq +# Or using yum +# yum update -y && yum install -y gcc g++ cmake numactl-devel wget git curl jq +# Config pip mirror +pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +``` + +**[Optional]** Then config the extra-index of `pip` if you are working on a x86 machine or using torch-npu dev version: + +```bash +# For torch-npu dev version or x86 machine +pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi" +``` + +Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**: + +```{code-block} bash + :substitutions: + +# Install vllm-project/vllm from pypi +pip install vllm==|pip_vllm_version| + +# Install vllm-project/vllm-ascend from pypi. +pip install vllm-ascend==|pip_vllm_ascend_version| +``` + +:::{dropdown} Click here to see "Build from source code" +or build from **source code**: + +```{code-block} bash + :substitutions: + +# Install vLLM +git clone --depth 1 --branch |vllm_version| https://github.com/vllm-project/vllm +cd vllm +VLLM_TARGET_DEVICE=empty pip install -v -e . +cd .. + +# Install vLLM Ascend +git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +pip install -v -e . +cd .. +``` + +vllm-ascend will build custom ops by default. If you don't want to build it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it. +::: + +```{note} +If you are building from v0.7.3-dev and intend to use sleep mode feature, you should set `COMPILE_CUSTOM_KERNELS=1` manually. 
+To build custom ops, gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using `pip install -e .` and encounter a torch-npu version conflict, please install with `pip install --no-build-isolation -e .` to build on system env. +If you encounter other problems during compiling, it is probably because unexpected compiler is being used, you may export `CXX_COMPILER` and `C_COMPILER` in env to specify your g++ and gcc locations before compiling. +``` + +:::: + +::::{tab-item} Using docker +:sync: docker + +You can just pull the **prebuilt image** and run it with bash. + +:::{dropdown} Click here to see "Build from Dockerfile" +or build IMAGE from **source code**: + +```bash +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +docker build -t vllm-ascend-dev-image:latest -f ./Dockerfile . +``` + +::: + +```{code-block} bash + :substitutions: + +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci7 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ + --name vllm-ascend-env \ + --device $DEVICE \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /root/.cache:/root/.cache \ + -it $IMAGE bash +``` + +The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to help developer immediately take place changes without requiring a new installation. 
+:::: + +::::: + +## Extra information + +### Verify installation + +Create and run a simple inference test. The `example.py` can be like: + +```python +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +# Create an LLM. +llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +Then run: + +```bash +# Try `export VLLM_USE_MODELSCOPE=true` and `pip install modelscope` +# to speed up download if huggingface is not reachable. +python example.py +``` + +The output will be like: + +```bash +INFO 02-18 08:49:58 __init__.py:28] Available plugins for group vllm.platform_plugins: +INFO 02-18 08:49:58 __init__.py:30] name=ascend, value=vllm_ascend:register +INFO 02-18 08:49:58 __init__.py:32] all available plugins for group vllm.platform_plugins will be loaded. +INFO 02-18 08:49:58 __init__.py:34] set environment variable VLLM_PLUGINS to control which plugins to load. +INFO 02-18 08:49:58 __init__.py:42] plugin ascend loaded. +INFO 02-18 08:49:58 __init__.py:174] Platform plugin ascend is activated +INFO 02-18 08:50:12 config.py:526] This model supports multiple tasks: {'embed', 'classify', 'generate', 'score', 'reward'}. Defaulting to 'generate'. 
+INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='./Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen2.5-0.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, +Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00, 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../community/contributors.md:1 +msgid "Maintainers and contributors" +msgstr "维护者和贡献者" + +#: ../../community/contributors.md:3 +msgid "Maintainers" +msgstr "维护者" + +#: ../../community/contributors.md +msgid "Name" +msgstr "名称" + +#: ../../community/contributors.md +msgid "Github ID" +msgstr "Github 账号" + +#: ../../community/contributors.md +msgid "Date" +msgstr "日期" + +#: ../../community/contributors.md +msgid "Xiyuan Wang" +msgstr "Xiyuan Wang" + +#: ../../community/contributors.md +msgid "[@wangxiyuan](https://github.com/wangxiyuan)" +msgstr "[@wangxiyuan](https://github.com/wangxiyuan)" + +#: ../../community/contributors.md +msgid "2025/01" +msgstr "2025/01" + +#: ../../community/contributors.md +msgid "Yikun Jiang" +msgstr "Yikun Jiang" + +#: ../../community/contributors.md +msgid "[@Yikun](https://github.com/Yikun)" +msgstr "[@Yikun](https://github.com/Yikun)" + +#: ../../community/contributors.md +msgid "2025/02" +msgstr "2025/02" + +#: ../../community/contributors.md +msgid "Yi Gan" +msgstr "Yi Gan" + +#: ../../community/contributors.md +msgid "[@ganyi1996ppo](https://github.com/ganyi1996ppo)" +msgstr "[@ganyi1996ppo](https://github.com/ganyi1996ppo)" + +#: ../../community/contributors.md +msgid "Shoujian Zheng" +msgstr "Shoujian Zheng" + +#: ../../community/contributors.md +msgid "[@jianzs](https://github.com/jianzs)" +msgstr "[@jianzs](https://github.com/jianzs)" + +#: ../../community/contributors.md +msgid "2025/06" +msgstr "2025/06" + +#: ../../community/contributors.md:12 +msgid "Contributors" +msgstr 
"贡献者" + +#: ../../community/contributors.md:14 +msgid "" +"vLLM Ascend every release would not have been possible without the following" +" contributors:" +msgstr "每个 vLLM Ascend 版本的发布都离不开以下贡献者:" + +#: ../../community/contributors.md:16 +msgid "Updated on 2025-06-10:" +msgstr "更新于 2025-06-10:" + +#: ../../community/contributors.md +msgid "Number" +msgstr "数字" + +#: ../../community/contributors.md +msgid "Contributor" +msgstr "贡献者" + +#: ../../community/contributors.md +msgid "Commit ID" +msgstr "提交 ID" + +#: ../../community/contributors.md +msgid "83" +msgstr "83" + +#: ../../community/contributors.md +msgid "[@ZhengWG](https://github.com/)" +msgstr "[@ZhengWG](https://github.com/)" + +#: ../../community/contributors.md +msgid "2025/7/7" +msgstr "2025/7/7" + +#: ../../community/contributors.md +msgid "" +"[3a469de](https://github.com/vllm-project/vllm-" +"ascend/commit/9c886d0a1f0fc011692090b0395d734c83a469de)" +msgstr "" +"[3a469de](https://github.com/vllm-project/vllm-" +"ascend/commit/9c886d0a1f0fc011692090b0395d734c83a469de)" + +#: ../../community/contributors.md +msgid "82" +msgstr "82" + +#: ../../community/contributors.md +msgid "[@wm901115nwpu](https://github.com/)" +msgstr "[@wm901115nwpu](https://github.com/)" + +#: ../../community/contributors.md +msgid "" +"[a2a47d4](https://github.com/vllm-project/vllm-" +"ascend/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4)" +msgstr "" +"[a2a47d4](https://github.com/vllm-project/vllm-" +"ascend/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4)" + +#: ../../community/contributors.md +msgid "81" +msgstr "81" + +#: ../../community/contributors.md +msgid "[@Agonixiaoxiao](https://github.com/)" +msgstr "[@Agonixiaoxiao](https://github.com/)" + +#: ../../community/contributors.md +msgid "2025/7/2" +msgstr "2025/7/2" + +#: ../../community/contributors.md +msgid "" +"[6f84576](https://github.com/vllm-project/vllm-" +"ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576)" +msgstr "" 
+"[6f84576](https://github.com/vllm-project/vllm-" +"ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576)" + +#: ../../community/contributors.md +msgid "80" +msgstr "80" + +#: ../../community/contributors.md +msgid "[@zhanghw0354](https://github.com/zhanghw0354)" +msgstr "[@zhanghw0354](https://github.com/zhanghw0354)" + +#: ../../community/contributors.md +msgid "" +"[d3df9a5](https://github.com/vllm-project/vllm-" +"ascend/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5)" +msgstr "" +"[d3df9a5](https://github.com/vllm-project/vllm-" +"ascend/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5)" + +#: ../../community/contributors.md +msgid "79" +msgstr "79" + +#: ../../community/contributors.md +msgid "[@GDzhu01](https://github.com/GDzhu01)" +msgstr "[@GDzhu01](https://github.com/GDzhu01)" + +#: ../../community/contributors.md +msgid "2025/6/28" +msgstr "2025/6/28" + +#: ../../community/contributors.md +msgid "" +"[de256ac](https://github.com/vllm-project/vllm-" +"ascend/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac)" +msgstr "" +"[de256ac](https://github.com/vllm-project/vllm-" +"ascend/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac)" + +#: ../../community/contributors.md +msgid "78" +msgstr "78" + +#: ../../community/contributors.md +msgid "[@leo-pony](https://github.com/leo-pony)" +msgstr "[@leo-pony](https://github.com/leo-pony)" + +#: ../../community/contributors.md +msgid "2025/6/26" +msgstr "2025/6/26" + +#: ../../community/contributors.md +msgid "" +"[3f2a5f2](https://github.com/vllm-project/vllm-" +"ascend/commit/10253449120307e3b45f99d82218ba53e3f2a5f2)" +msgstr "" +"[3f2a5f2](https://github.com/vllm-project/vllm-" +"ascend/commit/10253449120307e3b45f99d82218ba53e3f2a5f2)" + +#: ../../community/contributors.md +msgid "77" +msgstr "77" + +#: ../../community/contributors.md +msgid "[@zeshengzong](https://github.com/zeshengzong)" +msgstr "[@zeshengzong](https://github.com/zeshengzong)" + +#: ../../community/contributors.md +msgid "" 
+"[3ee25aa](https://github.com/vllm-project/vllm-" +"ascend/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa)" +msgstr "" +"[3ee25aa](https://github.com/vllm-project/vllm-" +"ascend/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa)" + +#: ../../community/contributors.md +msgid "76" +msgstr "76" + +#: ../../community/contributors.md +msgid "[@sharonyunyun](https://github.com/sharonyunyun)" +msgstr "[@sharonyunyun](https://github.com/sharonyunyun)" + +#: ../../community/contributors.md +msgid "2025/6/25" +msgstr "2025/6/25" + +#: ../../community/contributors.md +msgid "" +"[2dd8666](https://github.com/vllm-project/vllm-" +"ascend/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666)" +msgstr "" +"[2dd8666](https://github.com/vllm-project/vllm-" +"ascend/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666)" + +#: ../../community/contributors.md +msgid "75" +msgstr "75" + +#: ../../community/contributors.md +msgid "[@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee)" +msgstr "[@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee)" + +#: ../../community/contributors.md +msgid "" +"[c65dd40](https://github.com/vllm-project/vllm-" +"ascend/commit/2fda60464c287fe456b4a2f27e63996edc65dd40)" +msgstr "" +"[c65dd40](https://github.com/vllm-project/vllm-" +"ascend/commit/2fda60464c287fe456b4a2f27e63996edc65dd40)" + +#: ../../community/contributors.md +msgid "74" +msgstr "74" + +#: ../../community/contributors.md +msgid "[@xleoken](https://github.com/xleoken)" +msgstr "[@xleoken](https://github.com/xleoken)" + +#: ../../community/contributors.md +msgid "2025/6/23" +msgstr "2025/6/23" + +#: ../../community/contributors.md +msgid "" +"[c604de0](https://github.com/vllm-project/vllm-" +"ascend/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0)" +msgstr "" +"[c604de0](https://github.com/vllm-project/vllm-" +"ascend/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0)" + +#: ../../community/contributors.md +msgid "73" +msgstr "73" + +#: ../../community/contributors.md +msgid 
"[@lyj-jjj](https://github.com/lyj-jjj)" +msgstr "[@lyj-jjj](https://github.com/lyj-jjj)" + +#: ../../community/contributors.md +msgid "" +"[5cbd74e](https://github.com/vllm-project/vllm-" +"ascend/commit/5177bef87a21331dcca11159d3d1438075cbd74e)" +msgstr "" +"[5cbd74e](https://github.com/vllm-project/vllm-" +"ascend/commit/5177bef87a21331dcca11159d3d1438075cbd74e)" + +#: ../../community/contributors.md +msgid "72" +msgstr "72" + +#: ../../community/contributors.md +msgid "[@farawayboat](https://github.com/farawayboat)" +msgstr "[@farawayboat](https://github.com/farawayboat)" + +#: ../../community/contributors.md +msgid "2025/6/21" +msgstr "2025/6/21" + +#: ../../community/contributors.md +msgid "" +"[bc7d392](https://github.com/vllm-project/vllm-" +"ascend/commit/097e7149f75c0806774bc68207f0f6270bc7d392)" +msgstr "" +"[bc7d392](https://github.com/vllm-project/vllm-" +"ascend/commit/097e7149f75c0806774bc68207f0f6270bc7d392)" + +#: ../../community/contributors.md +msgid "71" +msgstr "71" + +#: ../../community/contributors.md +msgid "[@yuancaoyaoHW](https://github.com/yuancaoyaoHW)" +msgstr "[@yuancaoyaoHW](https://github.com/yuancaoyaoHW)" + +#: ../../community/contributors.md +msgid "2025/6/20" +msgstr "2025/6/20" + +#: ../../community/contributors.md +msgid "" +"[7aa0b94](https://github.com/vllm-project/vllm-" +"ascend/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94)" +msgstr "" +"[7aa0b94](https://github.com/vllm-project/vllm-" +"ascend/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94)" + +#: ../../community/contributors.md +msgid "70" +msgstr "70" + +#: ../../community/contributors.md +msgid "[@songshanhu07](https://github.com/songshanhu07)" +msgstr "[@songshanhu07](https://github.com/songshanhu07)" + +#: ../../community/contributors.md +msgid "2025/6/18" +msgstr "2025/6/18" + +#: ../../community/contributors.md +msgid "" +"[5e1de1f](https://github.com/vllm-project/vllm-" +"ascend/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f)" +msgstr "" 
+"[5e1de1f](https://github.com/vllm-project/vllm-" +"ascend/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f)" + +#: ../../community/contributors.md +msgid "69" +msgstr "69" + +#: ../../community/contributors.md +msgid "[@wangyanhui-cmss](https://github.com/wangyanhui-cmss)" +msgstr "[@wangyanhui-cmss](https://github.com/wangyanhui-cmss)" + +#: ../../community/contributors.md +msgid "2025/6/12" +msgstr "2025/6/12" + +#: ../../community/contributors.md +msgid "" +"[40c9e88](https://github.com/vllm-project/vllm-" +"ascend/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88)" +msgstr "" +"[40c9e88](https://github.com/vllm-project/vllm-" +"ascend/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88)" + +#: ../../community/contributors.md +msgid "68" +msgstr "68" + +#: ../../community/contributors.md +msgid "[@chenwaner](https://github.com/chenwaner)" +msgstr "[@chenwaner](https://github.com/chenwaner)" + +#: ../../community/contributors.md +msgid "2025/6/11" +msgstr "2025/6/11" + +#: ../../community/contributors.md +msgid "" +"[c696169](https://github.com/vllm-project/vllm-" +"ascend/commit/e46dc142bf1180453c64226d76854fc1ec696169)" +msgstr "" +"[c696169](https://github.com/vllm-project/vllm-" +"ascend/commit/e46dc142bf1180453c64226d76854fc1ec696169)" + +#: ../../community/contributors.md +msgid "67" +msgstr "67" + +#: ../../community/contributors.md +msgid "[@yzim](https://github.com/yzim)" +msgstr "[@yzim](https://github.com/yzim)" + +#: ../../community/contributors.md +msgid "" +"[aaf701b](https://github.com/vllm-project/vllm-" +"ascend/commit/4153a5091b698c2270d160409e7fee73baaf701b)" +msgstr "" +"[aaf701b](https://github.com/vllm-project/vllm-" +"ascend/commit/4153a5091b698c2270d160409e7fee73baaf701b)" + +#: ../../community/contributors.md +msgid "66" +msgstr "66" + +#: ../../community/contributors.md +msgid "[@Yuxiao-Xu](https://github.com/Yuxiao-Xu)" +msgstr "[@Yuxiao-Xu](https://github.com/Yuxiao-Xu)" + +#: ../../community/contributors.md +msgid "2025/6/9" +msgstr "2025/6/9" + 
+#: ../../community/contributors.md +msgid "" +"[6b853f1](https://github.com/vllm-project/vllm-" +"ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505)" +msgstr "" +"[6b853f1](https://github.com/vllm-project/vllm-" +"ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505)" + +#: ../../community/contributors.md +msgid "65" +msgstr "65" + +#: ../../community/contributors.md +msgid "[@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU)" +msgstr "[@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU)" + +#: ../../community/contributors.md +msgid "2025/6/7" +msgstr "2025/6/7" + +#: ../../community/contributors.md +msgid "" +"[20dedba](https://github.com/vllm-project/vllm-" +"ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193)" +msgstr "" +"[20dedba](https://github.com/vllm-project/vllm-" +"ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193)" + +#: ../../community/contributors.md +msgid "64" +msgstr "64" + +#: ../../community/contributors.md +msgid "[@zxdukki](https://github.com/zxdukki)" +msgstr "[@zxdukki](https://github.com/zxdukki)" + +#: ../../community/contributors.md +msgid "" +"[87ebaef](https://github.com/vllm-project/vllm-" +"ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf)" +msgstr "" +"[87ebaef](https://github.com/vllm-project/vllm-" +"ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf)" + +#: ../../community/contributors.md +msgid "63" +msgstr "63" + +#: ../../community/contributors.md +msgid "[@sdmyzlp](https://github.com/sdmyzlp)" +msgstr "[@sdmyzlp](https://github.com/sdmyzlp)" + +#: ../../community/contributors.md +msgid "" +"[3640c60](https://github.com/vllm-project/vllm-" +"ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7)" +msgstr "" +"[3640c60](https://github.com/vllm-project/vllm-" +"ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7)" + +#: ../../community/contributors.md +msgid "62" +msgstr "62" + +#: ../../community/contributors.md +msgid "[@weijinqian0](https://github.com/weijinqian0)" +msgstr 
"[@weijinqian0](https://github.com/weijinqian0)" + +#: ../../community/contributors.md +msgid "" +"[e9ada68](https://github.com/vllm-project/vllm-" +"ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b)" +msgstr "" +"[e9ada68](https://github.com/vllm-project/vllm-" +"ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b)" + +#: ../../community/contributors.md +msgid "61" +msgstr "61" + +#: ../../community/contributors.md +msgid "[@hahazhky](https://github.com/hahazhky)" +msgstr "[@hahazhky](https://github.com/hahazhky)" + +#: ../../community/contributors.md +msgid "2025/6/6" +msgstr "2025/6/6" + +#: ../../community/contributors.md +msgid "" +"[0b12c2a](https://github.com/vllm-project/vllm-" +"ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435)" +msgstr "" +"[0b12c2a](https://github.com/vllm-project/vllm-" +"ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435)" + +#: ../../community/contributors.md +msgid "60" +msgstr "60" + +#: ../../community/contributors.md +msgid "[@depeng1994](https://github.com/depeng1994)" +msgstr "[@depeng1994](https://github.com/depeng1994)" + +#: ../../community/contributors.md +msgid "" +"[6b094a2](https://github.com/vllm-project/vllm-" +"ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f)" +msgstr "" +"[6b094a2](https://github.com/vllm-project/vllm-" +"ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f)" + +#: ../../community/contributors.md +msgid "59" +msgstr "59" + +#: ../../community/contributors.md +msgid "[@David9857](https://github.com/David9857)" +msgstr "[@David9857](https://github.com/David9857)" + +#: ../../community/contributors.md +msgid "2025/6/5" +msgstr "2025/6/5" + +#: ../../community/contributors.md +msgid "" +"[78431b3](https://github.com/vllm-project/vllm-" +"ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927)" +msgstr "" +"[78431b3](https://github.com/vllm-project/vllm-" +"ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927)" + +#: ../../community/contributors.md +msgid "58" +msgstr "58" + 
+#: ../../community/contributors.md +msgid "[@momo609](https://github.com/momo609)" +msgstr "[@momo609](https://github.com/momo609)" + +#: ../../community/contributors.md +msgid "" +"[908a851](https://github.com/vllm-project/vllm-" +"ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7)" +msgstr "" +"[908a851](https://github.com/vllm-project/vllm-" +"ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7)" + +#: ../../community/contributors.md +msgid "57" +msgstr "57" + +#: ../../community/contributors.md +msgid "[@zhangxinyuehfad](https://github.com/zhangxinyuehfad)" +msgstr "[@zhangxinyuehfad](https://github.com/zhangxinyuehfad)" + +#: ../../community/contributors.md +msgid "" +"[7737aaa](https://github.com/vllm-project/vllm-" +"ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5)" +msgstr "" +"[7737aaa](https://github.com/vllm-project/vllm-" +"ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5)" + +#: ../../community/contributors.md +msgid "56" +msgstr "56" + +#: ../../community/contributors.md +msgid "[@NINGBENZHE](https://github.com/NINGBENZHE)" +msgstr "[@NINGBENZHE](https://github.com/NINGBENZHE)" + +#: ../../community/contributors.md +msgid "2025/6/3" +msgstr "2025/6/3" + +#: ../../community/contributors.md +msgid "" +"[6ec64a3](https://github.com/vllm-project/vllm-" +"ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099)" +msgstr "" +"[6ec64a3](https://github.com/vllm-project/vllm-" +"ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099)" + +#: ../../community/contributors.md +msgid "55" +msgstr "55" + +#: ../../community/contributors.md +msgid "[@XWFAlone](https://github.com/XWFAlone)" +msgstr "[@XWFAlone](https://github.com/XWFAlone)" + +#: ../../community/contributors.md +msgid "2025/5/30" +msgstr "2025/5/30" + +#: ../../community/contributors.md +msgid "" +"[3442fbd](https://github.com/vllm-project/vllm-" +"ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e)" +msgstr "" +"[3442fbd](https://github.com/vllm-project/vllm-" 
+"ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e)" + +#: ../../community/contributors.md +msgid "54" +msgstr "54" + +#: ../../community/contributors.md +msgid "[@YisongJiang](https://github.com/YisongJiang)" +msgstr "[@YisongJiang](https://github.com/YisongJiang)" + +#: ../../community/contributors.md +msgid "2025/5/29" +msgstr "2025/5/29" + +#: ../../community/contributors.md +msgid "" +"[90afaf6](https://github.com/vllm-project/vllm-" +"ascend/commit/90afaf6306f680307462becf3c78585737579851)" +msgstr "" +"[90afaf6](https://github.com/vllm-project/vllm-" +"ascend/commit/90afaf6306f680307462becf3c78585737579851)" + +#: ../../community/contributors.md +msgid "53" +msgstr "53" + +#: ../../community/contributors.md +msgid "[@ponix-j](https://github.com/ponix-j)" +msgstr "[@ponix-j](https://github.com/ponix-j)" + +#: ../../community/contributors.md +msgid "2025/5/23" +msgstr "2025/5/23" + +#: ../../community/contributors.md +msgid "" +"[df58fb8](https://github.com/vllm-project/vllm-" +"ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73)" +msgstr "" +"[df58fb8](https://github.com/vllm-project/vllm-" +"ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73)" + +#: ../../community/contributors.md +msgid "52" +msgstr "52" + +#: ../../community/contributors.md +msgid "[@ttanzhiqiang](https://github.com/ttanzhiqiang)" +msgstr "[@ttanzhiqiang](https://github.com/ttanzhiqiang)" + +#: ../../community/contributors.md +msgid "" +"[dc6172e](https://github.com/vllm-project/vllm-" +"ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40)" +msgstr "" +"[dc6172e](https://github.com/vllm-project/vllm-" +"ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40)" + +#: ../../community/contributors.md +msgid "51" +msgstr "51" + +#: ../../community/contributors.md +msgid "[@yangpuPKU](https://github.com/yangpuPKU)" +msgstr "[@yangpuPKU](https://github.com/yangpuPKU)" + +#: ../../community/contributors.md +msgid "" +"[46df67a](https://github.com/vllm-project/vllm-" 
+"ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1)" +msgstr "" +"[46df67a](https://github.com/vllm-project/vllm-" +"ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1)" + +#: ../../community/contributors.md +msgid "50" +msgstr "50" + +#: ../../community/contributors.md +msgid "[@wonderful199082](https://github.com/wonderful199082)" +msgstr "[@wonderful199082](https://github.com/wonderful199082)" + +#: ../../community/contributors.md +msgid "2025/5/20" +msgstr "2025/5/20" + +#: ../../community/contributors.md +msgid "" +"[5cf9ff1](https://github.com/vllm-project/vllm-" +"ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763)" +msgstr "" +"[5cf9ff1](https://github.com/vllm-project/vllm-" +"ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763)" + +#: ../../community/contributors.md +msgid "49" +msgstr "49" + +#: ../../community/contributors.md +msgid "[@22dimensions](https://github.com/22dimensions)" +msgstr "[@22dimensions](https://github.com/22dimensions)" + +#: ../../community/contributors.md +msgid "2025/5/17" +msgstr "2025/5/17" + +#: ../../community/contributors.md +msgid "" +"[a8730e7](https://github.com/vllm-project/vllm-" +"ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5)" +msgstr "" +"[a8730e7](https://github.com/vllm-project/vllm-" +"ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5)" + +#: ../../community/contributors.md +msgid "48" +msgstr "48" + +#: ../../community/contributors.md +msgid "[@cxcxflying](https://github.com/cxcxflying)" +msgstr "[@cxcxflying](https://github.com/cxcxflying)" + +#: ../../community/contributors.md +msgid "2025/5/13" +msgstr "2025/5/13" + +#: ../../community/contributors.md +msgid "" +"[e564470](https://github.com/vllm-project/vllm-" +"ascend/commit/e56447033889ca95df512208cab22ef832bfdf07)" +msgstr "" +"[e564470](https://github.com/vllm-project/vllm-" +"ascend/commit/e56447033889ca95df512208cab22ef832bfdf07)" + +#: ../../community/contributors.md +msgid "47" +msgstr "47" + +#: 
../../community/contributors.md +msgid "[@NeverRaR](https://github.com/NeverRaR)" +msgstr "[@NeverRaR](https://github.com/NeverRaR)" + +#: ../../community/contributors.md +msgid "2025/5/12" +msgstr "2025/5/12" + +#: ../../community/contributors.md +msgid "" +"[efabd72](https://github.com/vllm-project/vllm-" +"ascend/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1)" +msgstr "" +"[efabd72](https://github.com/vllm-project/vllm-" +"ascend/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1)" + +#: ../../community/contributors.md +msgid "46" +msgstr "46" + +#: ../../community/contributors.md +msgid "[@chris668899](https://github.com/chris668899)" +msgstr "[@chris668899](https://github.com/chris668899)" + +#: ../../community/contributors.md +msgid "2025/5/8" +msgstr "2025/5/8" + +#: ../../community/contributors.md +msgid "" +"[6c02088](https://github.com/vllm-project/vllm-" +"ascend/commit/6c020883a8332b5c519f4f6502733edd9b391c2b)" +msgstr "" +"[6c02088](https://github.com/vllm-project/vllm-" +"ascend/commit/6c020883a8332b5c519f4f6502733edd9b391c2b)" + +#: ../../community/contributors.md +msgid "45" +msgstr "45" + +#: ../../community/contributors.md +msgid "[@sunbaosong](https://github.com/sunbaosong)" +msgstr "[@sunbaosong](https://github.com/sunbaosong)" + +#: ../../community/contributors.md +msgid "2025/5/6" +msgstr "2025/5/6" + +#: ../../community/contributors.md +msgid "" +"[d6bfae8](https://github.com/vllm-project/vllm-" +"ascend/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4)" +msgstr "" +"[d6bfae8](https://github.com/vllm-project/vllm-" +"ascend/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4)" + +#: ../../community/contributors.md +msgid "44" +msgstr "44" + +#: ../../community/contributors.md +msgid "[@ApsarasX](https://github.com/ApsarasX)" +msgstr "[@ApsarasX](https://github.com/ApsarasX)" + +#: ../../community/contributors.md +msgid "2025/4/29" +msgstr "2025/4/29" + +#: ../../community/contributors.md +msgid "" +"[87975fa](https://github.com/vllm-project/vllm-" 
+"ascend/commit/87975fa058fe3f90d204ded42a08989a8dcb413e)" +msgstr "" +"[87975fa](https://github.com/vllm-project/vllm-" +"ascend/commit/87975fa058fe3f90d204ded42a08989a8dcb413e)" + +#: ../../community/contributors.md +msgid "43" +msgstr "43" + +#: ../../community/contributors.md +msgid "[@zouyida2052](https://github.com/zouyida2052)" +msgstr "[@zouyida2052](https://github.com/zouyida2052)" + +#: ../../community/contributors.md +msgid "2025/4/28" +msgstr "2025/4/28" + +#: ../../community/contributors.md +msgid "" +"[b9528e6](https://github.com/vllm-project/vllm-" +"ascend/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b)" +msgstr "" +"[b9528e6](https://github.com/vllm-project/vllm-" +"ascend/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b)" + +#: ../../community/contributors.md +msgid "42" +msgstr "42" + +#: ../../community/contributors.md +msgid "[@ZhengJun9](https://github.com/ZhengJun9)" +msgstr "[@ZhengJun9](https://github.com/ZhengJun9)" + +#: ../../community/contributors.md +msgid "" +"[1791113](https://github.com/vllm-project/vllm-" +"ascend/commit/17911138c90d78a76bd691e9dcb56763db35b19f)" +msgstr "" +"[1791113](https://github.com/vllm-project/vllm-" +"ascend/commit/17911138c90d78a76bd691e9dcb56763db35b19f)" + +#: ../../community/contributors.md +msgid "41" +msgstr "41" + +#: ../../community/contributors.md +msgid "[@linfeng-yuan](https://github.com/linfeng-yuan)" +msgstr "[@linfeng-yuan](https://github.com/linfeng-yuan)" + +#: ../../community/contributors.md +msgid "" +"[2204e4d](https://github.com/vllm-project/vllm-" +"ascend/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd)" +msgstr "" +"[2204e4d](https://github.com/vllm-project/vllm-" +"ascend/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd)" + +#: ../../community/contributors.md +msgid "40" +msgstr "40" + +#: ../../community/contributors.md +msgid "2025/4/27" +msgstr "2025/4/27" + +#: ../../community/contributors.md +msgid "" +"[fa4a5d9](https://github.com/vllm-project/vllm-" 
+"ascend/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5)" +msgstr "" +"[fa4a5d9](https://github.com/vllm-project/vllm-" +"ascend/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5)" + +#: ../../community/contributors.md +msgid "39" +msgstr "39" + +#: ../../community/contributors.md +msgid "[@fakeYan](https://github.com/fakeYan)" +msgstr "[@fakeYan](https://github.com/fakeYan)" + +#: ../../community/contributors.md +msgid "2025/4/23" +msgstr "2025/4/23" + +#: ../../community/contributors.md +msgid "" +"[05bdcbe](https://github.com/vllm-project/vllm-" +"ascend/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421)" +msgstr "" +"[05bdcbe](https://github.com/vllm-project/vllm-" +"ascend/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421)" + +#: ../../community/contributors.md +msgid "38" +msgstr "38" + +#: ../../community/contributors.md +msgid "[@RongRongStudio](https://github.com/RongRongStudio)" +msgstr "[@RongRongStudio](https://github.com/RongRongStudio)" + +#: ../../community/contributors.md +msgid "2025/4/22" +msgstr "2025/4/22" + +#: ../../community/contributors.md +msgid "" +"[848e041](https://github.com/vllm-project/vllm-" +"ascend/commit/848e041a54732c923660dd02daf8e9bf439736a2)" +msgstr "" +"[848e041](https://github.com/vllm-project/vllm-" +"ascend/commit/848e041a54732c923660dd02daf8e9bf439736a2)" + +#: ../../community/contributors.md +msgid "37" +msgstr "37" + +#: ../../community/contributors.md +msgid "[@paulyu12](https://github.com/paulyu12)" +msgstr "[@paulyu12](https://github.com/paulyu12)" + +#: ../../community/contributors.md +msgid "2025/4/17" +msgstr "2025/4/17" + +#: ../../community/contributors.md +msgid "" +"[697908f](https://github.com/vllm-project/vllm-" +"ascend/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e)" +msgstr "" +"[697908f](https://github.com/vllm-project/vllm-" +"ascend/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e)" + +#: ../../community/contributors.md +msgid "36" +msgstr "36" + +#: ../../community/contributors.md +msgid 
"[@heartStrive1998](https://github.com/heartStrive1998)" +msgstr "[@heartStrive1998](https://github.com/heartStrive1998)" + +#: ../../community/contributors.md +msgid "2025/4/16" +msgstr "2025/4/16" + +#: ../../community/contributors.md +msgid "" +"[2f15503](https://github.com/vllm-project/vllm-" +"ascend/commit/2f155039dc3997640854daef469bbf0cb77dc6ed)" +msgstr "" +"[2f15503](https://github.com/vllm-project/vllm-" +"ascend/commit/2f155039dc3997640854daef469bbf0cb77dc6ed)" + +#: ../../community/contributors.md +msgid "35" +msgstr "35" + +#: ../../community/contributors.md +msgid "[@eeethenQ](https://github.com/eeethenQ)" +msgstr "[@eeethenQ](https://github.com/eeethenQ)" + +#: ../../community/contributors.md +msgid "2025/4/15" +msgstr "2025/4/15" + +#: ../../community/contributors.md +msgid "" +"[44a8301](https://github.com/vllm-project/vllm-" +"ascend/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15)" +msgstr "" +"[44a8301](https://github.com/vllm-project/vllm-" +"ascend/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15)" + +#: ../../community/contributors.md +msgid "34" +msgstr "34" + +#: ../../community/contributors.md +msgid "[@wxsIcey](https://github.com/wxsIcey)" +msgstr "[@wxsIcey](https://github.com/wxsIcey)" + +#: ../../community/contributors.md +msgid "2025/4/10" +msgstr "2025/4/10" + +#: ../../community/contributors.md +msgid "" +"[d05ea17](https://github.com/vllm-project/vllm-" +"ascend/commit/d05ea17427b82a506b97409a7de8359f18f565f7)" +msgstr "" +"[d05ea17](https://github.com/vllm-project/vllm-" +"ascend/commit/d05ea17427b82a506b97409a7de8359f18f565f7)" + +#: ../../community/contributors.md +msgid "33" +msgstr "33" + +#: ../../community/contributors.md +msgid "[@yx0716](https://github.com/yx0716)" +msgstr "[@yx0716](https://github.com/yx0716)" + +#: ../../community/contributors.md +msgid "2025/4/8" +msgstr "2025/4/8" + +#: ../../community/contributors.md +msgid "" +"[5d62393](https://github.com/vllm-project/vllm-" 
+"ascend/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20)" +msgstr "" +"[5d62393](https://github.com/vllm-project/vllm-" +"ascend/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20)" + +#: ../../community/contributors.md +msgid "32" +msgstr "32" + +#: ../../community/contributors.md +msgid "[@celestialli](https://github.com/celestialli)" +msgstr "[@celestialli](https://github.com/celestialli)" + +#: ../../community/contributors.md +msgid "2025/4/7" +msgstr "2025/4/7" + +#: ../../community/contributors.md +msgid "" +"[2b765dc](https://github.com/vllm-project/vllm-" +"ascend/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0)" +msgstr "" +"[2b765dc](https://github.com/vllm-project/vllm-" +"ascend/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0)" + +#: ../../community/contributors.md +msgid "31" +msgstr "31" + +#: ../../community/contributors.md +msgid "[@hfadzxy](https://github.com/hfadzxy)" +msgstr "[@hfadzxy](https://github.com/hfadzxy)" + +#: ../../community/contributors.md +msgid "2025/3/30" +msgstr "2025/3/30" + +#: ../../community/contributors.md +msgid "" +"[7beb433](https://github.com/vllm-project/vllm-" +"ascend/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f)" +msgstr "" +"[7beb433](https://github.com/vllm-project/vllm-" +"ascend/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f)" + +#: ../../community/contributors.md +msgid "30" +msgstr "30" + +#: ../../community/contributors.md +msgid "[@wuhuikx](https://github.com/wuhuikx)" +msgstr "[@wuhuikx](https://github.com/wuhuikx)" + +#: ../../community/contributors.md +msgid "2025/3/28" +msgstr "2025/3/28" + +#: ../../community/contributors.md +msgid "" +"[57a84bb](https://github.com/vllm-project/vllm-" +"ascend/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca)" +msgstr "" +"[57a84bb](https://github.com/vllm-project/vllm-" +"ascend/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca)" + +#: ../../community/contributors.md +msgid "29" +msgstr "29" + +#: ../../community/contributors.md +msgid 
"[@zzzzwwjj](https://github.com/zzzzwwjj)" +msgstr "[@zzzzwwjj](https://github.com/zzzzwwjj)" + +#: ../../community/contributors.md +msgid "" +"[12390af](https://github.com/vllm-project/vllm-" +"ascend/commit/12390af075962456ecc8233d8dcce7064b75f390)" +msgstr "" +"[12390af](https://github.com/vllm-project/vllm-" +"ascend/commit/12390af075962456ecc8233d8dcce7064b75f390)" + +#: ../../community/contributors.md +msgid "28" +msgstr "28" + +#: ../../community/contributors.md +msgid "" +"[27e86b9](https://github.com/vllm-project/vllm-" +"ascend/commit/27e86b993a6a810d818143ec9dbfc439a419fa77)" +msgstr "" +"[27e86b9](https://github.com/vllm-project/vllm-" +"ascend/commit/27e86b993a6a810d818143ec9dbfc439a419fa77)" + +#: ../../community/contributors.md +msgid "27" +msgstr "27" + +#: ../../community/contributors.md +msgid "[@ZhengZhenyu](https://github.com/ZhengZhenyu)" +msgstr "[@ZhengZhenyu](https://github.com/ZhengZhenyu)" + +#: ../../community/contributors.md +msgid "2025/3/26" +msgstr "2025/3/26" + +#: ../../community/contributors.md +msgid "" +"[0b5a964](https://github.com/vllm-project/vllm-" +"ascend/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841)" +msgstr "" +"[0b5a964](https://github.com/vllm-project/vllm-" +"ascend/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841)" + +#: ../../community/contributors.md +msgid "26" +msgstr "26" + +#: ../../community/contributors.md +msgid "[@baifanxxx](https://github.com/baifanxxx)" +msgstr "[@baifanxxx](https://github.com/baifanxxx)" + +#: ../../community/contributors.md +msgid "" +"[1225052](https://github.com/vllm-project/vllm-" +"ascend/commit/122505208ff6284f409846ca7294f4a4b9883285)" +msgstr "" +"[1225052](https://github.com/vllm-project/vllm-" +"ascend/commit/122505208ff6284f409846ca7294f4a4b9883285)" + +#: ../../community/contributors.md +msgid "25" +msgstr "25" + +#: ../../community/contributors.md +msgid "[@rjg-lyh](https://github.com/rjg-lyh)" +msgstr "[@rjg-lyh](https://github.com/rjg-lyh)" + +#: 
../../community/contributors.md +msgid "2025/3/13" +msgstr "2025/3/13" + +#: ../../community/contributors.md +msgid "" +"[6512470](https://github.com/vllm-project/vllm-" +"ascend/commit/65124705fb39d4cc2c94c80254421e067a82fe50)" +msgstr "" +"[6512470](https://github.com/vllm-project/vllm-" +"ascend/commit/65124705fb39d4cc2c94c80254421e067a82fe50)" + +#: ../../community/contributors.md +msgid "24" +msgstr "24" + +#: ../../community/contributors.md +msgid "[@xiemingda-1002](https://github.com/xiemingda-1002)" +msgstr "[@xiemingda-1002](https://github.com/xiemingda-1002)" + +#: ../../community/contributors.md +msgid "2025/3/12" +msgstr "2025/3/12" + +#: ../../community/contributors.md +msgid "" +"[59ea23d](https://github.com/vllm-project/vllm-" +"ascend/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5)" +msgstr "" +"[59ea23d](https://github.com/vllm-project/vllm-" +"ascend/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5)" + +#: ../../community/contributors.md +msgid "23" +msgstr "23" + +#: ../../community/contributors.md +msgid "[@yiz-liu](https://github.com/yiz-liu)" +msgstr "[@yiz-liu](https://github.com/yiz-liu)" + +#: ../../community/contributors.md +msgid "2025/3/11" +msgstr "2025/3/11" + +#: ../../community/contributors.md +msgid "" +"[0db6670](https://github.com/vllm-project/vllm-" +"ascend/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca)" +msgstr "" +"[0db6670](https://github.com/vllm-project/vllm-" +"ascend/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca)" + +#: ../../community/contributors.md +msgid "22" +msgstr "22" + +#: ../../community/contributors.md +msgid "[@new-TonyWang](https://github.com/new-TonyWang)" +msgstr "[@new-TonyWang](https://github.com/new-TonyWang)" + +#: ../../community/contributors.md +msgid "" +"[dfb4e23](https://github.com/vllm-project/vllm-" +"ascend/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e)" +msgstr "" +"[dfb4e23](https://github.com/vllm-project/vllm-" +"ascend/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e)" + +#: 
../../community/contributors.md +msgid "21" +msgstr "21" + +#: ../../community/contributors.md +msgid "[@mengwei805](https://github.com/mengwei805)" +msgstr "[@mengwei805](https://github.com/mengwei805)" + +#: ../../community/contributors.md +msgid "2025/3/6" +msgstr "2025/3/6" + +#: ../../community/contributors.md +msgid "" +"[8fcf3d1](https://github.com/vllm-project/vllm-" +"ascend/commit/8fcf3d1704084626db35c5dc82ade446508598d4)" +msgstr "" +"[8fcf3d1](https://github.com/vllm-project/vllm-" +"ascend/commit/8fcf3d1704084626db35c5dc82ade446508598d4)" + +#: ../../community/contributors.md +msgid "20" +msgstr "20" + +#: ../../community/contributors.md +msgid "[@baymax591](https://github.com/baymax591)" +msgstr "[@baymax591](https://github.com/baymax591)" + +#: ../../community/contributors.md +msgid "2025/2/28" +msgstr "2025/2/28" + +#: ../../community/contributors.md +msgid "" +"[e8131b9](https://github.com/vllm-project/vllm-" +"ascend/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b)" +msgstr "" +"[e8131b9](https://github.com/vllm-project/vllm-" +"ascend/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b)" + +#: ../../community/contributors.md +msgid "19" +msgstr "19" + +#: ../../community/contributors.md +msgid "[@dependabot](https://github.com/dependabot)" +msgstr "[@dependabot](https://github.com/dependabot)" + +#: ../../community/contributors.md +msgid "2025/2/27" +msgstr "2025/2/27" + +#: ../../community/contributors.md +msgid "" +"[a5564ed](https://github.com/vllm-project/vllm-" +"ascend/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c)" +msgstr "" +"[a5564ed](https://github.com/vllm-project/vllm-" +"ascend/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c)" + +#: ../../community/contributors.md +msgid "18" +msgstr "18" + +#: ../../community/contributors.md +msgid "[@shink](https://github.com/shink)" +msgstr "[@shink](https://github.com/shink)" + +#: ../../community/contributors.md +msgid "" +"[6aed833](https://github.com/vllm-project/vllm-" 
+"ascend/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb)" +msgstr "" +"[6aed833](https://github.com/vllm-project/vllm-" +"ascend/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb)" + +#: ../../community/contributors.md +msgid "17" +msgstr "17" + +#: ../../community/contributors.md +msgid "[@wwfu109](https://github.com/wwfu109)" +msgstr "[@wwfu109](https://github.com/wwfu109)" + +#: ../../community/contributors.md +msgid "" +"[b074047](https://github.com/vllm-project/vllm-" +"ascend/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e)" +msgstr "" +"[b074047](https://github.com/vllm-project/vllm-" +"ascend/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e)" + +#: ../../community/contributors.md +msgid "16" +msgstr "16" + +#: ../../community/contributors.md +msgid "[@kunpengW-code](https://github.com/kunpengW-code)" +msgstr "[@kunpengW-code](https://github.com/kunpengW-code)" + +#: ../../community/contributors.md +msgid "2025/2/26" +msgstr "2025/2/26" + +#: ../../community/contributors.md +msgid "" +"[ca807ce](https://github.com/vllm-project/vllm-" +"ascend/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45)" +msgstr "" +"[ca807ce](https://github.com/vllm-project/vllm-" +"ascend/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45)" + +#: ../../community/contributors.md +msgid "15" +msgstr "15" + +#: ../../community/contributors.md +msgid "[@Yaphets24](https://github.com/Yaphets24)" +msgstr "[@Yaphets24](https://github.com/Yaphets24)" + +#: ../../community/contributors.md +msgid "2025/2/22" +msgstr "2025/2/22" + +#: ../../community/contributors.md +msgid "" +"[d0b3cb4](https://github.com/vllm-project/vllm-" +"ascend/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4)" +msgstr "" +"[d0b3cb4](https://github.com/vllm-project/vllm-" +"ascend/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4)" + +#: ../../community/contributors.md +msgid "14" +msgstr "14" + +#: ../../community/contributors.md +msgid "[@noemotiovon](https://github.com/noemotiovon)" +msgstr 
"[@noemotiovon](https://github.com/noemotiovon)" + +#: ../../community/contributors.md +msgid "2025/2/21" +msgstr "2025/2/21" + +#: ../../community/contributors.md +msgid "" +"[202b39a](https://github.com/vllm-project/vllm-" +"ascend/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0)" +msgstr "" +"[202b39a](https://github.com/vllm-project/vllm-" +"ascend/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0)" + +#: ../../community/contributors.md +msgid "13" +msgstr "13" + +#: ../../community/contributors.md +msgid "[@SidaoY](https://github.com/SidaoY)" +msgstr "[@SidaoY](https://github.com/SidaoY)" + +#: ../../community/contributors.md +msgid "2025/2/18" +msgstr "2025/2/18" + +#: ../../community/contributors.md +msgid "" +"[718c763](https://github.com/vllm-project/vllm-" +"ascend/commit/718c7638555d12cd43ea2a9e497e185778b68595)" +msgstr "" +"[718c763](https://github.com/vllm-project/vllm-" +"ascend/commit/718c7638555d12cd43ea2a9e497e185778b68595)" + +#: ../../community/contributors.md +msgid "12" +msgstr "12" + +#: ../../community/contributors.md +msgid "[@ShiyaNiu](https://github.com/ShiyaNiu)" +msgstr "[@ShiyaNiu](https://github.com/ShiyaNiu)" + +#: ../../community/contributors.md +msgid "2025/2/17" +msgstr "2025/2/17" + +#: ../../community/contributors.md +msgid "" +"[36ea38f](https://github.com/vllm-project/vllm-" +"ascend/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38)" +msgstr "" +"[36ea38f](https://github.com/vllm-project/vllm-" +"ascend/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38)" + +#: ../../community/contributors.md +msgid "11" +msgstr "11" + +#: ../../community/contributors.md +msgid "[@ji-huazhong](https://github.com/ji-huazhong)" +msgstr "[@ji-huazhong](https://github.com/ji-huazhong)" + +#: ../../community/contributors.md +msgid "2025/2/12" +msgstr "2025/2/12" + +#: ../../community/contributors.md +msgid "" +"[c8b57d1](https://github.com/vllm-project/vllm-" +"ascend/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82)" +msgstr "" 
+"[c8b57d1](https://github.com/vllm-project/vllm-" +"ascend/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82)" + +#: ../../community/contributors.md +msgid "10" +msgstr "10" + +#: ../../community/contributors.md +msgid "[@Angazenn](https://github.com/Angazenn)" +msgstr "[@Angazenn](https://github.com/Angazenn)" + +#: ../../community/contributors.md +msgid "2025/2/11" +msgstr "2025/2/11" + +#: ../../community/contributors.md +msgid "" +"[7637759](https://github.com/vllm-project/vllm-" +"ascend/commit/7637759056028839c74960d9cfd3ce6275ee5d35)" +msgstr "" +"[7637759](https://github.com/vllm-project/vllm-" +"ascend/commit/7637759056028839c74960d9cfd3ce6275ee5d35)" + +#: ../../community/contributors.md +msgid "9" +msgstr "9" + +#: ../../community/contributors.md +msgid "[@whx-sjtu](https://github.com/whx-sjtu)" +msgstr "[@whx-sjtu](https://github.com/whx-sjtu)" + +#: ../../community/contributors.md +msgid "2025/2/7" +msgstr "2025/2/7" + +#: ../../community/contributors.md +msgid "" +"[8fc5dc9](https://github.com/vllm-project/vllm-" +"ascend/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e)" +msgstr "" +"[8fc5dc9](https://github.com/vllm-project/vllm-" +"ascend/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e)" + +#: ../../community/contributors.md +msgid "8" +msgstr "8" + +#: ../../community/contributors.md +msgid "[@zouyida2002](https://github.com/zouyida2002)" +msgstr "[@zouyida2002](https://github.com/zouyida2002)" + +#: ../../community/contributors.md +msgid "" +"[4495fc6](https://github.com/vllm-project/vllm-" +"ascend/commit/4495fc68389e3fb1ef14534c202948931e38446b)" +msgstr "" +"[4495fc6](https://github.com/vllm-project/vllm-" +"ascend/commit/4495fc68389e3fb1ef14534c202948931e38446b)" + +#: ../../community/contributors.md +msgid "7" +msgstr "7" + +#: ../../community/contributors.md +msgid "[@hw_whx](https://github.com/hw_whx)" +msgstr "[@hw_whx](https://github.com/hw_whx)" + +#: ../../community/contributors.md +msgid "" 
+"[7d16772](https://github.com/vllm-project/vllm-" +"ascend/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a)" +msgstr "" +"[7d16772](https://github.com/vllm-project/vllm-" +"ascend/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a)" + +#: ../../community/contributors.md +msgid "6" +msgstr "6" + +#: ../../community/contributors.md +msgid "[@MengqingCao](https://github.com/MengqingCao)" +msgstr "[@MengqingCao](https://github.com/MengqingCao)" + +#: ../../community/contributors.md +msgid "2025/2/6" +msgstr "2025/2/6" + +#: ../../community/contributors.md +msgid "" +"[7d9ae22](https://github.com/vllm-project/vllm-" +"ascend/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730)" +msgstr "" +"[7d9ae22](https://github.com/vllm-project/vllm-" +"ascend/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730)" + +#: ../../community/contributors.md +msgid "5" +msgstr "5" + +#: ../../community/contributors.md +msgid "[@Potabk](https://github.com/Potabk)" +msgstr "[@Potabk](https://github.com/Potabk)" + +#: ../../community/contributors.md +msgid "" +"[8cb5615](https://github.com/vllm-project/vllm-" +"ascend/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86)" +msgstr "" +"[8cb5615](https://github.com/vllm-project/vllm-" +"ascend/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86)" + +#: ../../community/contributors.md +msgid "4" +msgstr "4" + +#: ../../community/contributors.md +msgid "" +"[a48b9ad](https://github.com/vllm-project/vllm-" +"ascend/commit/a48b9addefd292af523644411d4ff4142dd4bc66)" +msgstr "" +"[a48b9ad](https://github.com/vllm-project/vllm-" +"ascend/commit/a48b9addefd292af523644411d4ff4142dd4bc66)" + +#: ../../community/contributors.md +msgid "3" +msgstr "3" + +#: ../../community/contributors.md +msgid "[@shen-shanshan](https://github.com/shen-shanshan)" +msgstr "[@shen-shanshan](https://github.com/shen-shanshan)" + +#: ../../community/contributors.md +msgid "" +"[bfccf73](https://github.com/vllm-project/vllm-" +"ascend/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e)" +msgstr "" 
+"[bfccf73](https://github.com/vllm-project/vllm-" +"ascend/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e)" + +#: ../../community/contributors.md +msgid "2" +msgstr "2" + +#: ../../community/contributors.md +msgid "2025/2/5" +msgstr "2025/2/5" + +#: ../../community/contributors.md +msgid "" +"[d5e7756](https://github.com/vllm-project/vllm-" +"ascend/commit/d5e7756028bd5884ade96b654555c375770a2f64)" +msgstr "" +"[d5e7756](https://github.com/vllm-project/vllm-" +"ascend/commit/d5e7756028bd5884ade96b654555c375770a2f64)" + +#: ../../community/contributors.md +msgid "1" +msgstr "1" + +#: ../../community/contributors.md +msgid "[@simon-mo](https://github.com/simon-mo)" +msgstr "[@simon-mo](https://github.com/simon-mo)" + +#: ../../community/contributors.md +msgid "2025/1/29" +msgstr "2025/1/29" + +#: ../../community/contributors.md +msgid "" +"[eb28342](https://github.com/vllm-project/vllm-" +"ascend/commit/eb283428ddc17207b6866118f9bc15454b5b8801)" +msgstr "" +"[eb28342](https://github.com/vllm-project/vllm-" +"ascend/commit/eb283428ddc17207b6866118f9bc15454b5b8801)" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po new file mode 100644 index 0000000..030aa24 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po @@ -0,0 +1,204 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../community/governance.md:1 +msgid "Governance" +msgstr "治理" + +#: ../../community/governance.md:3 +msgid "Mission" +msgstr "使命" + +#: ../../community/governance.md:4 +msgid "" +"As a vital component of vLLM, the vLLM Ascend project is dedicated to " +"providing an easy, fast, and cheap LLM Serving for Everyone on Ascend NPU, " +"and to actively contribute to the enrichment of vLLM." +msgstr "" +"作为 vLLM 的重要组成部分,vLLM Ascend 项目致力于为所有人在 Ascend NPU 上提供简单、快速且低成本的大语言模型服务,并积极促进" +" vLLM 的丰富发展。" + +#: ../../community/governance.md:6 +msgid "Principles" +msgstr "原则" + +#: ../../community/governance.md:7 +msgid "" +"vLLM Ascend follows the vLLM community's code of conduct:[vLLM - CODE OF " +"CONDUCT](https://github.com/vllm-project/vllm/blob/main/CODE_OF_CONDUCT.md)" +msgstr "" +"vLLM Ascend 遵循 vLLM 社区的行为准则:[vLLM - 行为准则](https://github.com/vllm-" +"project/vllm/blob/main/CODE_OF_CONDUCT.md)" + +#: ../../community/governance.md:9 +msgid "Governance - Mechanics" +msgstr "治理 - 机制" + +#: ../../community/governance.md:10 +msgid "" +"vLLM Ascend is an open-source project under the vLLM community, where the " +"authority to appoint roles is ultimately determined by the vLLM community. " +"It adopts a hierarchical technical governance structure." 
+msgstr "vLLM Ascend 是 vLLM 社区下的一个开源项目,其角色任命权最终由 vLLM 社区决定。它采用分层的技术治理结构。" + +#: ../../community/governance.md:12 +msgid "Contributor:" +msgstr "贡献者:" + +#: ../../community/governance.md:14 +msgid "" +"**Responsibility:** Help new contributors on boarding, handle and respond to" +" community questions, review RFCs, code" +msgstr "**职责:** 帮助新贡献者加入,处理和回复社区问题,审查RFC和代码" + +#: ../../community/governance.md:16 +msgid "" +"**Requirements:** Complete at least 1 contribution. Contributor is someone " +"who consistently and actively participates in a project, included but not " +"limited to issue/review/commits/community involvement." +msgstr "**要求:** 完成至少1次贡献。贡献者是指持续且积极参与项目的人,包括但不限于问题、评审、提交和社区参与。" + +#: ../../community/governance.md:18 +msgid "" +"Contributors will be empowered [vllm-project/vllm-" +"ascend](https://github.com/vllm-project/vllm-ascend) Github repo `Triage` " +"permissions (`Can read and clone this repository. Can also manage issues and" +" pull requests`) to help community developers collaborate more efficiently." +msgstr "" +"贡献者将被赋予 [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-" +"ascend) Github 仓库的 `Triage` 权限(`可读取和克隆此仓库。还可以管理问题和拉取请求`),以帮助社区开发者更加高效地协作。" + +#: ../../community/governance.md:20 +msgid "Maintainer:" +msgstr "维护者:" + +#: ../../community/governance.md:22 +msgid "" +"**Responsibility:** Develop the project's vision and mission. Maintainers " +"are responsible for driving the technical direction of the entire project " +"and ensuring its overall success, possessing code merge permissions. They " +"formulate the roadmap, review contributions from community members, " +"continuously contribute code, and actively engage in community activities " +"(such as regular meetings/events)." 
+msgstr "" +"**责任:** " +"制定项目的愿景和使命。维护者负责引领整个项目的技术方向并确保其整体成功,拥有代码合并权限。他们制定路线图,审核社区成员的贡献,持续贡献代码,并积极参与社区活动(如定期会议/活动)。" + +#: ../../community/governance.md:24 +msgid "" +"**Requirements:** Deep understanding of ‌vLLM‌ and ‌vLLM Ascend‌ codebases, " +"with a commitment to sustained code contributions. Competency in " +"‌design/development/PR review workflows‌." +msgstr "" +"**要求:** 深入理解 ‌vLLM‌ 和 ‌vLLM Ascend‌ 代码库,并承诺持续贡献代码。具备 ‌设计/开发/PR 审核流程‌ 的能力。" + +#: ../../community/governance.md:25 +msgid "" +"**Review Quality‌:** Actively participate in community code reviews, " +"ensuring high-quality code integration." +msgstr "**评审质量:** 积极参与社区代码评审,确保高质量的代码集成。" + +#: ../../community/governance.md:26 +msgid "" +"**Quality Contribution‌:** Successfully develop and deliver at least one " +"major feature while maintaining consistent high-quality contributions." +msgstr "**质量贡献‌:** 成功开发并交付至少一个主要功能,同时持续保持高质量的贡献。" + +#: ../../community/governance.md:27 +msgid "" +"**Community Involvement‌:** Actively address issues, respond to forum " +"inquiries, participate in discussions, and engage in community-driven tasks." +msgstr "**社区参与:** 积极解决问题,回复论坛询问,参与讨论,并参与社区驱动的任务。" + +#: ../../community/governance.md:29 +msgid "" +"Requires approval from existing Maintainers. The vLLM community has the " +"final decision-making authority." +msgstr "需要现有维护者的批准。vLLM社区拥有最终决策权。" + +#: ../../community/governance.md:31 +msgid "" +"Maintainer will be empowered [vllm-project/vllm-" +"ascend](https://github.com/vllm-project/vllm-ascend) Github repo write " +"permissions (`Can read, clone, and push to this repository. Can also manage " +"issues and pull requests`)." 
+msgstr "" +"维护者将被授予 [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-" +"ascend) Github 仓库的写入权限(`可以读取、克隆和推送到此仓库。还可以管理问题和拉取请求`)。" + +#: ../../community/governance.md:33 +msgid "Nominating and Removing Maintainers" +msgstr "提名和移除维护者" + +#: ../../community/governance.md:35 +msgid "The Principles" +msgstr "原则" + +#: ../../community/governance.md:37 +msgid "" +"Membership in vLLM Ascend is given to individuals on merit basis after they " +"demonstrated strong expertise of the vLLM / vLLM Ascend through " +"contributions, reviews and discussions." +msgstr "" +"vLLM Ascend 的成员资格是基于个人能力授予的,只有在通过贡献、评审和讨论展示出对 vLLM / vLLM Ascend " +"的深厚专业知识后,才可获得。" + +#: ../../community/governance.md:39 +msgid "" +"For membership in the maintainer group the individual has to demonstrate " +"strong and continued alignment with the overall vLLM / vLLM Ascend " +"principles." +msgstr "要成为维护者组成员,个人必须表现出与 vLLM / vLLM Ascend 总体原则的高度一致并持续支持。" + +#: ../../community/governance.md:41 +msgid "" +"Light criteria of moving module maintenance to ‘emeritus’ status if they " +"don’t actively participate over long periods of time." +msgstr "如果模块维护人员在长时间内没有积极参与,可根据较宽松的标准将其维护状态转为“荣誉”状态。" + +#: ../../community/governance.md:43 +msgid "The membership is for an individual, not a company." +msgstr "该会员资格属于个人,而非公司。" + +#: ../../community/governance.md:45 +msgid "Nomination and Removal" +msgstr "提名与罢免" + +#: ../../community/governance.md:47 +msgid "" +"Nomination: Anyone can nominate someone to become a maintainer (include " +"self-nominate). All existing maintainers are responsible for evaluating the " +"nomination. The nominator should provide nominee's info around the strength " +"of the candidate to be a maintainer, include but not limited to review " +"quality, quality contribution, community involvement." 
+msgstr "" +"提名:任何人都可以提名他人成为维护者(包括自荐)。所有现有维护者都有责任评估提名。提名人应提供被提名人成为维护者的相关优势信息,包括但不限于评审质量、优质贡献、社区参与等。" + +#: ../../community/governance.md:48 +msgid "" +"Removal: Anyone can nominate a person to be removed from maintainer position" +" (include self-nominate). All existing maintainers are responsible for " +"evaluating the nomination. The nominator should provide nominee's info, " +"include but not limited to lack of activity, conflict with the overall " +"direction and other information that makes them unfit to be a maintainer." +msgstr "" +"移除:任何人都可以提名某人被移出维护者职位(包括自荐)。所有现有维护者都有责任评估该提名。提名者应提供被提名人的相关信息,包括但不限于缺乏活动、与整体方向冲突以及使其不适合作为维护者的其他信息。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po new file mode 100644 index 0000000..3d91ba7 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/index.po @@ -0,0 +1,103 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../community/user_stories/index.md:15 +msgid "More details" +msgstr "更多细节" + +#: ../../community/user_stories/index.md:1 +msgid "User Stories" +msgstr "用户故事" + +#: ../../community/user_stories/index.md:3 +msgid "" +"Read case studies on how users and developers solves real, everyday problems" +" with vLLM Ascend" +msgstr "阅读案例研究,了解用户和开发者如何使用 vLLM Ascend 解决实际日常问题。" + +#: ../../community/user_stories/index.md:5 +msgid "" +"[LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform " +"for training and fine-tuning large language models, it supports vLLM Ascend " +"to speed up inference since [LLaMA-" +"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gain 2x " +"performance enhancement of inference." +msgstr "" +"[LLaMA-Factory](./llamafactory.md) 是一个易于使用且高效的大语言模型训练与微调平台,自 [LLaMA-" +"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739) 起支持 vLLM " +"Ascend 加速推理,推理性能提升 2 倍。" + +#: ../../community/user_stories/index.md:7 +msgid "" +"[Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge " +"library designed for post-training foundation models using advanced " +"techniques like SFT, PPO and DPO, it uses vLLM Ascend since " +"[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to " +"support RLHF on Ascend NPU." 
+msgstr "" +"[Huggingface/trl](https://github.com/huggingface/trl) 是一个前沿的库,专为使用 SFT、PPO 和" +" DPO 等先进技术对基础模型进行后训练而设计。从 " +"[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) 版本开始,该库利用" +" vLLM Ascend 来支持在 Ascend NPU 上进行 RLHF。" + +#: ../../community/user_stories/index.md:9 +msgid "" +"[MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference " +"engine acceleration plug-in library developed by Huawei on Ascend hardware, " +"which includes self-developed large language model optimization algorithms " +"and optimizations related to the inference engine framework. It supports " +"vLLM Ascend since " +"[2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-" +"turbo-0001.html)." +msgstr "" +"[MindIE Turbo](https://pypi.org/project/mindie-turbo) " +"是华为在昇腾硬件上开发的一款用于加速LLM推理引擎的插件库,包含自主研发的大语言模型优化算法及与推理引擎框架相关的优化。从 " +"[2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-" +"turbo-0001.html) 起,支持 vLLM Ascend。" + +#: ../../community/user_stories/index.md:11 +msgid "" +"[GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU " +"cluster manager for running AI models. It supports vLLM Ascend since " +"[v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more" +" GPUStack performance evaluation info on " +"[link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)." 
+msgstr "" +"[GPUStack](https://github.com/gpustack/gpustack) 是一个开源的 GPU 集群管理器,用于运行 AI " +"模型。从 [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2) " +"版本开始支持 vLLM Ascend,更多 GPUStack 性能评测信息见 " +"[链接](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)。" + +#: ../../community/user_stories/index.md:13 +msgid "" +"[verl](https://github.com/volcengine/verl) is a flexible, efficient and " +"production-ready RL training library for large language models (LLMs), uses " +"vLLM Ascend since " +"[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more " +"info on [verl x Ascend " +"Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html)." +msgstr "" +"[verl](https://github.com/volcengine/verl) " +"是一个灵活、高效且可用于生产环境的大型语言模型(LLM)强化学习训练库,自 " +"[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0) 起支持 vLLM " +"Ascend,更多信息请参见 [verl x Ascend " +"快速上手](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po new file mode 100644 index 0000000..4e8c771 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/user_stories/llamafactory.po @@ -0,0 +1,87 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: vllm-ascend\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2025-07-18 09:01+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME \n"
+"Language-Team: zh_CN \n"
+"Language: zh_CN\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"Generated-By: Babel 2.17.0\n"
+
+#: ../../community/user_stories/llamafactory.md:1
+msgid "LLaMA-Factory"
+msgstr "LLaMA-Factory"
+
+#: ../../community/user_stories/llamafactory.md:3
+msgid "**About / Introduction**"
+msgstr "**关于 / 介绍**"
+
+#: ../../community/user_stories/llamafactory.md:5
+msgid ""
+"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use "
+"and efficient platform for training and fine-tuning large language models. "
+"With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally"
+" without writing any code."
+msgstr ""
+"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) "
+"是一个易于使用且高效的平台,用于训练和微调大型语言模型。有了 LLaMA-Factory,你可以在本地对数百个预训练模型进行微调,无需编写任何代码。"
+
+#: ../../community/user_stories/llamafactory.md:7
+msgid ""
+"LLaMA-Facotory users need to evaluate and inference the model after fine-"
+"tuning the model."
+msgstr "LLaMA-Factory 用户需要在对模型进行微调后对模型进行评估和推理。"
+
+#: ../../community/user_stories/llamafactory.md:9
+msgid "**The Business Challenge**"
+msgstr "**业务挑战**"
+
+#: ../../community/user_stories/llamafactory.md:11
+msgid ""
+"LLaMA-Factory used transformers to perform inference on Ascend NPU, but the "
+"speed was slow."
+msgstr "LLaMA-Factory 使用 transformers 在 Ascend NPU 上进行推理,但速度较慢。" + +#: ../../community/user_stories/llamafactory.md:13 +msgid "**Solving Challenges and Benefits with vLLM Ascend**" +msgstr "**通过 vLLM Ascend 解决挑战与收益**" + +#: ../../community/user_stories/llamafactory.md:15 +msgid "" +"With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-" +"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the " +"performance of LLaMA-Factory in the model inference stage has been " +"significantly improved. According to the test results, the inference speed " +"of LLaMA-Factory has been increased to 2x compared to the transformers " +"version." +msgstr "" +"在 LLaMA-Factory 和 vLLM Ascend 的共同努力下(参见 [LLaMA-" +"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)),LLaMA-" +"Factory 在模型推理阶段的性能得到了显著提升。根据测试结果,LLaMA-Factory 的推理速度相比 transformers 版本提升到了 2" +" 倍。" + +#: ../../community/user_stories/llamafactory.md:17 +msgid "**Learn more**" +msgstr "**了解更多**" + +#: ../../community/user_stories/llamafactory.md:19 +msgid "" +"See more about LLaMA-Factory and how it uses vLLM Ascend for inference on " +"the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU " +"Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)." +msgstr "" +"在以下文档中查看更多关于 LLaMA-Factory 以及其如何在 Ascend NPU 上使用 vLLM Ascend 进行推理的信息:[LLaMA-" +"Factory Ascend NPU " +"推理](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po b/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po new file mode 100644 index 0000000..62b2a48 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/community/versioning_policy.po @@ -0,0 +1,624 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../community/versioning_policy.md:1 +msgid "Versioning policy" +msgstr "版本管理策略" + +#: ../../community/versioning_policy.md:3 +msgid "" +"Starting with vLLM 0.7.x, the vLLM Ascend Plugin ([vllm-project/vllm-" +"ascend](https://github.com/vllm-project/vllm-ascend)) project follows the " +"[PEP 440](https://peps.python.org/pep-0440/) to publish matching with vLLM " +"([vllm-project/vllm](https://github.com/vllm-project/vllm))." +msgstr "" +"从 vLLM 0.7.x 开始,vLLM Ascend 插件([vllm-project/vllm-" +"ascend](https://github.com/vllm-project/vllm-ascend))项目遵循 [PEP " +"440](https://peps.python.org/pep-0440/) ,以与 vLLM([vllm-" +"project/vllm](https://github.com/vllm-project/vllm))版本匹配发布。" + +#: ../../community/versioning_policy.md:5 +msgid "vLLM Ascend Plugin versions" +msgstr "vLLM Ascend 插件版本" + +#: ../../community/versioning_policy.md:7 +msgid "" +"Each vLLM Ascend release will be versioned: " +"`v[major].[minor].[micro][rcN][.postN]` (such as `v0.7.3rc1`, `v0.7.3`, " +"`v0.7.3.post1`)" +msgstr "" +"每个 vLLM Ascend 版本将采用以下版本格式:`v[major].[minor].[micro][rcN][.postN]`(例如 " +"`v0.7.3rc1`、`v0.7.3`、`v0.7.3.post1`)" + +#: ../../community/versioning_policy.md:10 +msgid "" +"**Final releases**: will typically be released every **3 months**, will take" +" the vLLM upstream release plan and Ascend software product release plan " +"into comprehensive consideration." 
+msgstr "**正式版本**:通常每**3个月**发布一次,将综合考虑 vLLM 上游发行计划和昇腾软件产品发行计划。" + +#: ../../community/versioning_policy.md:11 +msgid "" +"**Pre releases**: will typically be released **on demand**, ending with rcN," +" represents the Nth release candidate version, to support early testing by " +"our users prior to a final release." +msgstr "**预发布版本**:通常会**按需发布**,以 rcN 结尾,表示第N个候选发布版本,旨在支持用户在正式发布前进行早期测试。" + +#: ../../community/versioning_policy.md:12 +msgid "" +"**Post releases**: will typically be released **on demand** to support to " +"address minor errors in a final release. It's different from [PEP-440 post " +"release note](https://peps.python.org/pep-0440/#post-releases) suggestion, " +"it will contain actual bug fixes considering that the final release version " +"should be matched strictly with the vLLM final release version " +"(`v[major].[minor].[micro]`). The post version has to be published as a " +"patch version of the final release." +msgstr "" +"**后续版本**:通常会根据需要发布,以支持解决正式发布中的小错误。这与 [PEP-440 " +"的后续版本说明](https://peps.python.org/pep-0440/#post-releases) 建议不同,它将包含实际的 bug " +"修复,因为最终发布版本应严格与 vLLM " +"的最终发布版本(`v[major].[minor].[micro]`)匹配。后续版本必须以正式发布的补丁版本形式发布。" + +#: ../../community/versioning_policy.md:14 +msgid "For example:" +msgstr "例如:" + +#: ../../community/versioning_policy.md:15 +msgid "" +"`v0.7.x`: it's the first final release to match the vLLM `v0.7.x` version." +msgstr "`v0.7.x`:这是第一个与 vLLM `v0.7.x` 版本相匹配的正式发布版本。" + +#: ../../community/versioning_policy.md:16 +msgid "`v0.7.3rc1`: will be the first pre version of vLLM Ascend." +msgstr "`v0.7.3rc1`:将会是 vLLM Ascend 的第一个预发布版本。" + +#: ../../community/versioning_policy.md:17 +msgid "" +"`v0.7.3.post1`: will be the post release if the `v0.7.3` release has some " +"minor errors." 
+msgstr "`v0.7.3.post1`:如果 `v0.7.3` 版本发布有一些小错误,将作为后续修正版发布。" + +#: ../../community/versioning_policy.md:19 +msgid "Release Compatibility Matrix" +msgstr "版本兼容性矩阵" + +#: ../../community/versioning_policy.md:21 +msgid "Following is the Release Compatibility Matrix for vLLM Ascend Plugin:" +msgstr "以下是 vLLM Ascend 插件的版本兼容性矩阵:" + +#: ../../community/versioning_policy.md +msgid "vLLM Ascend" +msgstr "vLLM Ascend" + +#: ../../community/versioning_policy.md +msgid "vLLM" +msgstr "vLLM" + +#: ../../community/versioning_policy.md +msgid "Python" +msgstr "Python" + +#: ../../community/versioning_policy.md +msgid "Stable CANN" +msgstr "Stable CANN" + +#: ../../community/versioning_policy.md +msgid "PyTorch/torch_npu" +msgstr "PyTorch/torch_npu" + +#: ../../community/versioning_policy.md +msgid "MindIE Turbo" +msgstr "MindIE Turbo" + +#: ../../community/versioning_policy.md +msgid "v0.9.2rc1" +msgstr "v0.9.2rc1" + +#: ../../community/versioning_policy.md +msgid "v0.9.2" +msgstr "v0.9.2" + +#: ../../community/versioning_policy.md +msgid ">= 3.9, < 3.12" +msgstr ">= 3.9,< 3.12" + +#: ../../community/versioning_policy.md +msgid "8.1.RC1" +msgstr "8.1.RC1" + +#: ../../community/versioning_policy.md +msgid "2.5.1 / 2.5.1.post1.dev20250619" +msgstr "2.5.1 / 2.5.1.post1.dev20250619" + +#: ../../community/versioning_policy.md +msgid "v0.9.1rc1" +msgstr "v0.9.1rc1" + +#: ../../community/versioning_policy.md +msgid "v0.9.1" +msgstr "v0.9.1" + +#: ../../community/versioning_policy.md +msgid "2.5.1 / 2.5.1.post1.dev20250528" +msgstr "2.5.1 / 2.5.1.post1.dev20250528" + +#: ../../community/versioning_policy.md +msgid "v0.9.0rc2" +msgstr "v0.9.0rc2" + +#: ../../community/versioning_policy.md +msgid "v0.9.0" +msgstr "v0.9.0" + +#: ../../community/versioning_policy.md +msgid "2.5.1 / 2.5.1" +msgstr "2.5.1 / 2.5.1" + +#: ../../community/versioning_policy.md +msgid "v0.9.0rc1" +msgstr "v0.9.0rc1" + +#: ../../community/versioning_policy.md +msgid "v0.8.5rc1" +msgstr "v0.8.5rc1" + +#: 
../../community/versioning_policy.md
+msgid "v0.8.5.post1"
+msgstr "v0.8.5.post1"
+
+#: ../../community/versioning_policy.md
+msgid "v0.8.4rc2"
+msgstr "v0.8.4rc2"
+
+#: ../../community/versioning_policy.md
+msgid "v0.8.4"
+msgstr "v0.8.4"
+
+#: ../../community/versioning_policy.md
+msgid "8.0.0"
+msgstr "8.0.0"
+
+#: ../../community/versioning_policy.md
+msgid "v0.7.3.post1"
+msgstr "v0.7.3.post1"
+
+#: ../../community/versioning_policy.md
+msgid "v0.7.3"
+msgstr "v0.7.3"
+
+#: ../../community/versioning_policy.md
+msgid "2.0rc1"
+msgstr "2.0rc1"
+
+#: ../../community/versioning_policy.md:34
+msgid "Release cadence"
+msgstr "发布节奏"
+
+#: ../../community/versioning_policy.md:36
+msgid "release window"
+msgstr "发布窗口"
+
+#: ../../community/versioning_policy.md
+msgid "Date"
+msgstr "日期"
+
+#: ../../community/versioning_policy.md
+msgid "Event"
+msgstr "事件"
+
+#: ../../community/versioning_policy.md
+msgid "2025.07.11"
+msgstr "2025.07.11"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.9.2rc1"
+msgstr "候选发布版本,v0.9.2rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.06.22"
+msgstr "2025.06.22"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.9.1rc1"
+msgstr "候选发布版本,v0.9.1rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.06.10"
+msgstr "2025.06.10"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.9.0rc2"
+msgstr "候选发布版本,v0.9.0rc2"
+
+#: ../../community/versioning_policy.md
+msgid "2025.06.09"
+msgstr "2025.06.09"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.9.0rc1"
+msgstr "候选发布版本,v0.9.0rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.05.29"
+msgstr "2025.05.29"
+
+#: ../../community/versioning_policy.md
+msgid "v0.7.x post release, v0.7.3.post1"
+msgstr "v0.7.x 补丁版,v0.7.3.post1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.05.08"
+msgstr "2025.05.08"
+
+#: ../../community/versioning_policy.md
+msgid "v0.7.x Final release, v0.7.3"
+msgstr "v0.7.x 正式版,v0.7.3"
+
+#: ../../community/versioning_policy.md
+msgid "2025.05.06"
+msgstr "2025.05.06"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.8.5rc1"
+msgstr "候选发布版本,v0.8.5rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.04.28"
+msgstr "2025.04.28"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.8.4rc2"
+msgstr "候选发布版本,v0.8.4rc2"
+
+#: ../../community/versioning_policy.md
+msgid "2025.04.18"
+msgstr "2025.04.18"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.8.4rc1"
+msgstr "候选发布版本,v0.8.4rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.03.28"
+msgstr "2025.03.28"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.7.3rc2"
+msgstr "候选发布版本,v0.7.3rc2"
+
+#: ../../community/versioning_policy.md
+msgid "2025.03.14"
+msgstr "2025.03.14"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.7.3rc1"
+msgstr "候选发布版本,v0.7.3rc1"
+
+#: ../../community/versioning_policy.md
+msgid "2025.02.19"
+msgstr "2025.02.19"
+
+#: ../../community/versioning_policy.md
+msgid "Release candidates, v0.7.1rc1"
+msgstr "候选发布版本,v0.7.1rc1"
+
+#: ../../community/versioning_policy.md:53
+msgid "Branch policy"
+msgstr "分支策略"
+
+#: ../../community/versioning_policy.md:55
+msgid "vLLM Ascend has main branch and dev branch."
+msgstr "vLLM Ascend 有主分支和开发分支。"
+
+#: ../../community/versioning_policy.md:57
+msgid ""
+"**main**: main branch,corresponds to the vLLM main branch and latest 1 or 2 "
+"release version. It is continuously monitored for quality through Ascend CI."
+msgstr "**main**:main 分支,对应 vLLM 的主分支和最新的 1 或 2 个发布版本。该分支通过 Ascend CI 持续监控质量。"
+
+#: ../../community/versioning_policy.md:58
+msgid ""
+"**vX.Y.Z-dev**: development branch, created with part of new releases of "
+"vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version."
+msgstr "" +"**vX.Y.Z-dev**:开发分支,是随着 vLLM 新版本的一部分一起创建的。例如,`v0.7.3-dev` 是 vLLM `v0.7.3` " +"版本的开发分支。" + +#: ../../community/versioning_policy.md:60 +msgid "" +"Usually, a commit should be ONLY first merged in the main branch, and then " +"backported to the dev branch to reduce maintenance costs as much as " +"possible." +msgstr "通常,提交应该只先合并到主分支,然后再回溯合并到开发分支,以尽可能降低维护成本。" + +#: ../../community/versioning_policy.md:62 +msgid "Maintenance branch and EOL:" +msgstr "维护分支与生命周期结束(EOL):" + +#: ../../community/versioning_policy.md:63 +msgid "The branch status will be in one of the following states:" +msgstr "分支状态将处于以下几种状态之一:" + +#: ../../community/versioning_policy.md +msgid "Branch" +msgstr "分支" + +#: ../../community/versioning_policy.md +msgid "Time frame" +msgstr "时间范围" + +#: ../../community/versioning_policy.md +msgid "Summary" +msgstr "摘要" + +#: ../../community/versioning_policy.md +msgid "Maintained" +msgstr "维护中" + +#: ../../community/versioning_policy.md +msgid "Approximately 2-3 minor versions" +msgstr "大约 2-3 个小版本" + +#: ../../community/versioning_policy.md +msgid "All bugfixes are appropriate. Releases produced, CI commitment." +msgstr "所有的错误修复都是合适的。正常发布版本,持续集成承诺。" + +#: ../../community/versioning_policy.md +msgid "Unmaintained" +msgstr "无人维护" + +#: ../../community/versioning_policy.md +msgid "Community interest driven" +msgstr "社区兴趣驱动" + +#: ../../community/versioning_policy.md +msgid "All bugfixes are appropriate. 
No Releases produced, No CI commitment" +msgstr "所有的 bug 修复都是合适的。没有发布版本,不承诺持续集成(CI)。" + +#: ../../community/versioning_policy.md +msgid "End of Life (EOL)" +msgstr "生命周期结束(EOL)" + +#: ../../community/versioning_policy.md +msgid "N/A" +msgstr "不适用" + +#: ../../community/versioning_policy.md +msgid "Branch no longer accepting changes" +msgstr "该分支不再接受更改" + +#: ../../community/versioning_policy.md:71 +msgid "Branch state" +msgstr "分支状态" + +#: ../../community/versioning_policy.md:73 +msgid "" +"Note that vLLM Ascend will only be released for a certain vLLM release " +"version rather than all versions. Hence, You might see only part of versions" +" have dev branches (such as only `0.7.1-dev` / `0.7.3-dev` but no " +"`0.7.2-dev`), this is as expected." +msgstr "" +"请注意,vLLM Ascend 只会针对某些 vLLM 发布版本发布,而不是所有版本。因此,您可能会看到只有部分版本拥有开发分支(例如只有 " +"`0.7.1-dev` / `0.7.3-dev`,而没有 `0.7.2-dev`),这是正常现象。" + +#: ../../community/versioning_policy.md:75 +msgid "" +"Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM " +"Ascend version branch and support its latest version (for example, we plan " +"to support version 0.7.3) as following shown:" +msgstr "" +"通常,vLLM 的每一个小版本(例如 0.7)都会对应一个 vLLM Ascend 版本分支,并支持其最新版本(例如,我们计划支持 0.7.3 " +"版),如下所示:" + +#: ../../community/versioning_policy.md +msgid "Status" +msgstr "状态" + +#: ../../community/versioning_policy.md +msgid "Note" +msgstr "注释" + +#: ../../community/versioning_policy.md +msgid "main" +msgstr "main" + +#: ../../community/versioning_policy.md +msgid "CI commitment for vLLM main branch and vLLM 0.9.2 branch" +msgstr "vLLM 主分支和 vLLM 0.9.2 分支的 CI 承诺" + +#: ../../community/versioning_policy.md +msgid "v0.9.1-dev" +msgstr "v0.9.1-dev" + +#: ../../community/versioning_policy.md +msgid "CI commitment for vLLM 0.9.1 version" +msgstr "vLLM 0.9.1 版本的 CI 承诺" + +#: ../../community/versioning_policy.md +msgid "v0.7.3-dev" +msgstr "v0.7.3-dev" + +#: ../../community/versioning_policy.md +msgid "CI commitment for vLLM 0.7.3 version" 
+msgstr "vLLM 0.7.3 版本的 CI 承诺" + +#: ../../community/versioning_policy.md +msgid "v0.7.1-dev" +msgstr "v0.7.1-dev" + +#: ../../community/versioning_policy.md +msgid "Replaced by v0.7.3-dev" +msgstr "已被 v0.7.3-dev 替代" + +#: ../../community/versioning_policy.md:84 +msgid "Backward compatibility" +msgstr "向后兼容性" + +#: ../../community/versioning_policy.md:86 +msgid "" +"For main branch, vLLM Ascend should works with vLLM main branch and latest 1" +" or 2 release version. So to ensure the backward compatibility, we will do " +"the following:" +msgstr "" +"对于主分支,vLLM Ascend 应该与 vLLM 主分支以及最新的 1 或 2 个发布版本兼容。因此,为了确保向后兼容性,我们将执行以下操作:" + +#: ../../community/versioning_policy.md:87 +msgid "" +"Both main branch and target vLLM release is tested by Ascend E2E CI. For " +"example, currently, vLLM main branch and vLLM 0.8.4 are tested now." +msgstr "主分支和目标 vLLM 发行版都经过了 Ascend E2E CI 的测试。例如,目前正在测试 vLLM 主分支和 vLLM 0.8.4。" + +#: ../../community/versioning_policy.md:88 +msgid "" +"For code changes, we will make sure that the changes are compatible with the" +" latest 1 or 2 vLLM release version as well. In this case, vLLM Ascend " +"introduced a version check machinism inner the code. It'll check the version" +" of installed vLLM package first to decide which code logic to use. If users" +" hit the `InvalidVersion` error, it sometimes means that they have installed" +" an dev/editable version of vLLM package. In this case, we provide the env " +"variable `VLLM_VERSION` to let users specify the version of vLLM package to " +"use." +msgstr "" +"对于代码更改,我们也会确保这些更改与最新的 1 或 2 个 vLLM 发行版本兼容。在这种情况下,vLLM Ascend " +"在代码中引入了版本检查机制。它会先检查已安装的 vLLM 包的版本,然后决定使用哪段代码逻辑。如果用户遇到 `InvalidVersion` " +"错误,这有时意味着他们安装了 dev/可编辑版本的 vLLM 包。此时,我们提供了环境变量 `VLLM_VERSION`,让用户可以指定要使用的 " +"vLLM 包版本。" + +#: ../../community/versioning_policy.md:89 +msgid "" +"For documentation changes, we will make sure that the changes are compatible" +" with the latest 1 or 2 vLLM release version as well. 
Note should be added " +"if there are any breaking changes." +msgstr "对于文档更改,我们会确保这些更改也兼容于最新的1个或2个 vLLM 发布版本。如果有任何重大变更,应添加说明。" + +#: ../../community/versioning_policy.md:91 +msgid "Document Branch Policy" +msgstr "文档分支政策" + +#: ../../community/versioning_policy.md:92 +msgid "" +"To reduce maintenance costs, **all branch documentation content should " +"remain consistent, and version differences can be controlled via variables " +"in [docs/source/conf.py](https://github.com/vllm-project/vllm-" +"ascend/blob/main/docs/source/conf.py)**. While this is not a simple task, it" +" is a principle we should strive to follow." +msgstr "" +"为了减少维护成本,**所有分支的文档内容应保持一致,版本差异可以通过 " +"[docs/source/conf.py](https://github.com/vllm-project/vllm-" +"ascend/blob/main/docs/source/conf.py) 中的变量进行控制**。虽然这并非易事,但这是我们应当努力遵循的原则。" + +#: ../../community/versioning_policy.md +msgid "Version" +msgstr "版本" + +#: ../../community/versioning_policy.md +msgid "Purpose" +msgstr "用途" + +#: ../../community/versioning_policy.md +msgid "Code Branch" +msgstr "代码分支" + +#: ../../community/versioning_policy.md +msgid "latest" +msgstr "最新" + +#: ../../community/versioning_policy.md +msgid "Doc for the latest dev branch" +msgstr "最新开发分支的文档" + +#: ../../community/versioning_policy.md +msgid "vX.Y.Z-dev (Will be `main` after the first final release)" +msgstr "vX.Y.Z-dev(在第一个正式版本发布后将成为 `main`)" + +#: ../../community/versioning_policy.md +msgid "version" +msgstr "版本" + +#: ../../community/versioning_policy.md +msgid "Doc for historical released versions" +msgstr "历史版本文档" + +#: ../../community/versioning_policy.md +msgid "Git tags, like vX.Y.Z[rcN]" +msgstr "Git 标签,如 vX.Y.Z[rcN]" + +#: ../../community/versioning_policy.md +msgid "stable(not yet released)" +msgstr "稳定版(尚未发布)" + +#: ../../community/versioning_policy.md +msgid "Doc for latest final release branch" +msgstr "最新正式发布分支的文档" + +#: ../../community/versioning_policy.md +msgid "Will be `vX.Y.Z-dev` after the first official release" +msgstr "首个正式发布后将会是 
`vX.Y.Z-dev`" + +#: ../../community/versioning_policy.md:100 +msgid "As shown above:" +msgstr "如上所示:" + +#: ../../community/versioning_policy.md:102 +msgid "" +"`latest` documentation: Matches the current maintenance branch `vX.Y.Z-dev` " +"(Will be `main` after the first final release). Continuously updated to " +"ensure usability for the latest release." +msgstr "" +"`latest` 文档:匹配当前维护分支 `vX.Y.Z-dev`(在首次正式发布后将为 `main`)。持续更新,以确保适用于最新发布版本。" + +#: ../../community/versioning_policy.md:103 +msgid "" +"`version` documentation: Corresponds to specific released versions (e.g., " +"`v0.7.3`, `v0.7.3rc1`). No further updates after release." +msgstr "`version` 文档:对应特定的已发布版本(例如,`v0.7.3`、`v0.7.3rc1`)。发布后不再进行更新。" + +#: ../../community/versioning_policy.md:104 +msgid "" +"`stable` documentation (**not yet released**): Official release " +"documentation. Updates are allowed in real-time after release, typically " +"based on vX.Y.Z-dev. Once stable documentation is available, non-stable " +"versions should display a header warning: `You are viewing the latest " +"developer preview docs. Click here to view docs for the latest stable " +"release.`." +msgstr "" +"`stable` 文档(**尚未发布**):官方发布版文档。发布后允许实时更新,通常基于 " +"vX.Y.Z-dev。一旦稳定版文档可用,非稳定版本应显示一个顶部警告:`您正在查看最新的开发预览文档。点击此处查看最新稳定版本文档。`" + +#: ../../community/versioning_policy.md:106 +msgid "Software Dependency Management" +msgstr "软件依赖管理" + +#: ../../community/versioning_policy.md:107 +msgid "" +"`torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable " +"version to [PyPi](https://pypi.org/project/torch-npu) every 3 months, a " +"development version (aka the POC version) every month, and a nightly version" +" every day. The PyPi stable version **CAN** be used in vLLM Ascend final " +"version, the monthly dev version **ONLY CANN** be used in vLLM Ascend RC " +"version for rapid iteration, the nightly version **CANNOT** be used in vLLM " +"Ascend any version and branches." 
+msgstr "" +"`torch-npu`:Ascend Extension for PyTorch(torch-npu)每 3 个月会在 " +"[PyPi](https://pypi.org/project/torch-npu) 上发布一个稳定版本,每个月发布一个开发版本(即 POC " +"版本),每天发布一个 nightly 版本。PyPi 上的稳定版本**可以**用于 vLLM Ascend 的正式版本,月度开发版本**只能**用于 " +"vLLM Ascend 的 RC(候选发布)版本以便快速迭代,nightly 版本**不能**用于 vLLM Ascend 的任何版本和分支。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po new file mode 100644 index 0000000..9e45711 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/index.po @@ -0,0 +1,187 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/contribution/index.md:107 +msgid "Index" +msgstr "索引" + +#: ../../developer_guide/contribution/index.md:1 +msgid "Contributing" +msgstr "贡献" + +#: ../../developer_guide/contribution/index.md:3 +msgid "Building and testing" +msgstr "构建与测试" + +#: ../../developer_guide/contribution/index.md:4 +msgid "" +"It's recommended to set up a local development environment to build and test" +" before you submit a PR." 
+msgstr "建议先搭建本地开发环境来进行构建和测试,再提交 PR。" + +#: ../../developer_guide/contribution/index.md:7 +msgid "Setup development environment" +msgstr "搭建开发环境" + +#: ../../developer_guide/contribution/index.md:9 +msgid "" +"Theoretically, the vllm-ascend build is only supported on Linux because " +"`vllm-ascend` dependency `torch_npu` only supports Linux." +msgstr "" +"理论上,vllm-ascend 构建仅支持 Linux,因为 `vllm-ascend` 的依赖项 `torch_npu` 只支持 Linux。" + +#: ../../developer_guide/contribution/index.md:12 +msgid "" +"But you can still set up dev env on Linux/Windows/macOS for linting and " +"basic test as following commands:" +msgstr "但你仍然可以在 Linux/Windows/macOS 上按照以下命令设置开发环境,用于代码规约检查和基本测试:" + +#: ../../developer_guide/contribution/index.md:15 +msgid "Run lint locally" +msgstr "在本地运行 lint" + +#: ../../developer_guide/contribution/index.md:33 +msgid "Run CI locally" +msgstr "本地运行 CI" + +#: ../../developer_guide/contribution/index.md:35 +msgid "After complete \"Run lint\" setup, you can run CI locally:" +msgstr "在完成“运行 lint”设置后,你可以在本地运行 CI:" + +#: ../../developer_guide/contribution/index.md:61 +msgid "Submit the commit" +msgstr "提交 commit" + +#: ../../developer_guide/contribution/index.md:68 +msgid "" +"🎉 Congratulations! You have completed the development environment setup." +msgstr "🎉 恭喜!你已经完成了开发环境的搭建。" + +#: ../../developer_guide/contribution/index.md:70 +msgid "Test locally" +msgstr "本地测试" + +#: ../../developer_guide/contribution/index.md:72 +msgid "" +"You can refer to [Testing](./testing.md) doc to help you setup testing " +"environment and running tests locally." +msgstr "你可以参考 [测试](./testing.md) 文档,帮助你搭建测试环境并在本地运行测试。" + +#: ../../developer_guide/contribution/index.md:74 +msgid "DCO and Signed-off-by" +msgstr "DCO 和签名确认" + +#: ../../developer_guide/contribution/index.md:76 +msgid "" +"When contributing changes to this project, you must agree to the DCO. " +"Commits must include a `Signed-off-by:` header which certifies agreement " +"with the terms of the DCO." 
+msgstr "当为本项目贡献更改时,您必须同意 DCO。提交必须包含 `Signed-off-by:` 头部,以证明您同意 DCO 的条款。" + +#: ../../developer_guide/contribution/index.md:78 +msgid "Using `-s` with `git commit` will automatically add this header." +msgstr "在使用 `git commit` 时加上 `-s` 参数会自动添加这个头部信息。" + +#: ../../developer_guide/contribution/index.md:80 +msgid "PR Title and Classification" +msgstr "PR 标题与分类" + +#: ../../developer_guide/contribution/index.md:82 +msgid "" +"Only specific types of PRs will be reviewed. The PR title is prefixed " +"appropriately to indicate the type of change. Please use one of the " +"following:" +msgstr "只有特定类型的 PR 会被审核。PR 标题应使用合适的前缀以指明更改类型。请使用以下之一:" + +#: ../../developer_guide/contribution/index.md:84 +msgid "`[Attention]` for new features or optimization in attention." +msgstr "`[Attention]` 用于注意力机制中新特性或优化。" + +#: ../../developer_guide/contribution/index.md:85 +msgid "`[Communicator]` for new features or optimization in communicators." +msgstr "`[Communicator]` 适用于通信器中的新特性或优化。" + +#: ../../developer_guide/contribution/index.md:86 +msgid "`[ModelRunner]` for new features or optimization in model runner." +msgstr "`[ModelRunner]` 用于模型运行器中的新功能或优化。" + +#: ../../developer_guide/contribution/index.md:87 +msgid "`[Platform]` for new features or optimization in platform." +msgstr "`[Platform]` 用于平台中新功能或优化。" + +#: ../../developer_guide/contribution/index.md:88 +msgid "`[Worker]` for new features or optimization in worker." +msgstr "`[Worker]` 用于 worker 的新功能或优化。" + +#: ../../developer_guide/contribution/index.md:89 +msgid "" +"`[Core]` for new features or optimization in the core vllm-ascend logic " +"(such as platform, attention, communicators, model runner)" +msgstr "`[Core]` 用于核心 vllm-ascend 逻辑中的新特性或优化(例如平台、注意力机制、通信器、模型运行器)。" + +#: ../../developer_guide/contribution/index.md:90 +msgid "`[Kernel]` changes affecting compute kernels and ops." +msgstr "`[Kernel]` 影响计算内核和操作的更改。" + +#: ../../developer_guide/contribution/index.md:91 +msgid "`[Bugfix]` for bug fixes." 
+msgstr "`[Bugfix]` 用于表示错误修复。" + +#: ../../developer_guide/contribution/index.md:92 +msgid "`[Doc]` for documentation fixes and improvements." +msgstr "`[Doc]` 用于文档修复和改进。" + +#: ../../developer_guide/contribution/index.md:93 +msgid "`[Test]` for tests (such as unit tests)." +msgstr "`[Test]` 用于测试(如单元测试)。" + +#: ../../developer_guide/contribution/index.md:94 +msgid "`[CI]` for build or continuous integration improvements." +msgstr "`[CI]` 用于构建或持续集成的改进。" + +#: ../../developer_guide/contribution/index.md:95 +msgid "" +"`[Misc]` for PRs that do not fit the above categories. Please use this " +"sparingly." +msgstr "对于不属于上述类别的 PR,请使用 `[Misc]`。请谨慎使用此标签。" + +#: ../../developer_guide/contribution/index.md:98 +msgid "" +"If the PR spans more than one category, please include all relevant " +"prefixes." +msgstr "如果拉取请求(PR)涵盖多个类别,请包含所有相关的前缀。" + +#: ../../developer_guide/contribution/index.md:101 +msgid "Others" +msgstr "其他" + +#: ../../developer_guide/contribution/index.md:103 +msgid "" +"You may find more information about contributing to vLLM Ascend backend " +"plugin on " +"[docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html)." +" If you find any problem when contributing, you can feel free to submit a PR" +" to improve the doc to help other developers." +msgstr "" +"你可以在 " +"[docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html)" +" 上找到有关为 vLLM Ascend 后端插件做贡献的更多信息。如果你在贡献过程中遇到任何问题,欢迎随时提交 PR 来改进文档,以帮助其他开发者。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po new file mode 100644 index 0000000..8a9ca91 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/contribution/testing.po @@ -0,0 +1,237 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/contribution/testing.md:1 +msgid "Testing" +msgstr "测试" + +#: ../../developer_guide/contribution/testing.md:3 +msgid "" +"This secition explains how to write e2e tests and unit tests to verify the " +"implementation of your feature." +msgstr "本节介绍如何编写端到端测试和单元测试,以验证你的功能实现。" + +#: ../../developer_guide/contribution/testing.md:5 +msgid "Setup test environment" +msgstr "设置测试环境" + +#: ../../developer_guide/contribution/testing.md:7 +msgid "" +"The fastest way to setup test environment is to use the main branch " +"container image:" +msgstr "搭建测试环境最快的方法是使用 main 分支的容器镜像:" + +#: ../../developer_guide/contribution/testing.md +msgid "Local (CPU)" +msgstr "本地(CPU)" + +#: ../../developer_guide/contribution/testing.md:18 +msgid "You can run the unit tests on CPU with the following steps:" +msgstr "你可以按照以下步骤在 CPU 上运行单元测试:" + +#: ../../developer_guide/contribution/testing.md +msgid "Single card" +msgstr "单卡" + +#: ../../developer_guide/contribution/testing.md:85 +#: ../../developer_guide/contribution/testing.md:123 +msgid "" +"After starting the container, you should install the required packages:" +msgstr "启动容器后,你应该安装所需的软件包:" + +#: ../../developer_guide/contribution/testing.md +msgid "Multi cards" +msgstr "多卡" + +#: ../../developer_guide/contribution/testing.md:137 +msgid "Running tests" +msgstr "运行测试" + +#: ../../developer_guide/contribution/testing.md:139 +msgid "Unit test" +msgstr "单元测试" + +#: ../../developer_guide/contribution/testing.md:141 +msgid "There are several principles to follow when writing unit 
tests:" +msgstr "编写单元测试时需要遵循几个原则:" + +#: ../../developer_guide/contribution/testing.md:143 +msgid "" +"The test file path should be consistent with source file and start with " +"`test_` prefix, such as: `vllm_ascend/worker/worker_v1.py` --> " +"`tests/ut/worker/test_worker_v1.py`" +msgstr "" +"测试文件的路径应与源文件保持一致,并以 `test_` 前缀开头,例如:`vllm_ascend/worker/worker_v1.py` --> " +"`tests/ut/worker/test_worker_v1.py`" + +#: ../../developer_guide/contribution/testing.md:144 +msgid "" +"The vLLM Ascend test are using unittest framework, see " +"[here](https://docs.python.org/3/library/unittest.html#module-unittest) to " +"understand how to write unit tests." +msgstr "" +"vLLM Ascend 测试使用 unittest " +"框架,参见[这里](https://docs.python.org/3/library/unittest.html#module-" +"unittest)了解如何编写单元测试。" + +#: ../../developer_guide/contribution/testing.md:145 +msgid "" +"All unit tests can be run on CPU, so you must mock the device-related " +"function to host." +msgstr "所有单元测试都可以在 CPU 上运行,因此你必须将与设备相关的函数模拟为 host。" + +#: ../../developer_guide/contribution/testing.md:146 +msgid "" +"Example: [tests/ut/test_ascend_config.py](https://github.com/vllm-" +"project/vllm-ascend/blob/main/tests/ut/test_ascend_config.py)." +msgstr "" +"示例:[tests/ut/test_ascend_config.py](https://github.com/vllm-project/vllm-" +"ascend/blob/main/tests/ut/test_ascend_config.py)。" + +#: ../../developer_guide/contribution/testing.md:147 +msgid "You can run the unit tests using `pytest`:" +msgstr "你可以使用 `pytest` 运行单元测试:" + +#: ../../developer_guide/contribution/testing.md +msgid "Multi cards test" +msgstr "多卡测试" + +#: ../../developer_guide/contribution/testing.md:192 +msgid "E2E test" +msgstr "端到端测试" + +#: ../../developer_guide/contribution/testing.md:194 +msgid "" +"Although vllm-ascend CI provide [e2e test](https://github.com/vllm-" +"project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml) on " +"Ascend CI, you can run it locally." 
+msgstr "" +"虽然 vllm-ascend CI 在 Ascend CI 上提供了 [端到端测试](https://github.com/vllm-" +"project/vllm-" +"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml),你也可以在本地运行它。" + +#: ../../developer_guide/contribution/testing.md:204 +msgid "You can't run e2e test on CPU." +msgstr "你无法在 CPU 上运行 e2e 测试。" + +#: ../../developer_guide/contribution/testing.md:240 +msgid "" +"This will reproduce e2e test: " +"[vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-" +"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml)." +msgstr "" +"这将复现端到端测试:[vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-" +"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml)。" + +#: ../../developer_guide/contribution/testing.md:242 +msgid "E2E test example:" +msgstr "E2E 测试示例:" + +#: ../../developer_guide/contribution/testing.md:244 +msgid "" +"Offline test example: " +"[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-" +"project/vllm-" +"ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)" +msgstr "" +"离线测试示例:[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-" +"project/vllm-" +"ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)" + +#: ../../developer_guide/contribution/testing.md:245 +msgid "" +"Online test examples: " +"[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-" +"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)" +msgstr "" +"在线测试示例:[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-" +"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)" + +#: ../../developer_guide/contribution/testing.md:246 +msgid "" +"Correctness test example: " +"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-" +"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)" +msgstr "" +"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-" 
+"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)" + +#: ../../developer_guide/contribution/testing.md:247 +msgid "" +"Reduced Layer model test example: [test_torchair_graph_mode.py - " +"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-" +"ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)" +msgstr "" +"简化层模型测试示例:[test_torchair_graph_mode.py - " +"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-" +"ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)" + +#: ../../developer_guide/contribution/testing.md:249 +msgid "" +"The CI resource is limited, you might need to reduce layer number of the " +"model, below is an example of how to generate a reduced layer model:" +msgstr "CI 资源有限,您可能需要减少模型的层数,下面是一个生成减少层数模型的示例:" + +#: ../../developer_guide/contribution/testing.md:250 +msgid "" +"Fork the original model repo in modelscope, we need all the files in the " +"repo except for weights." +msgstr "在 modelscope 中 fork 原始模型仓库,我们需要仓库中的所有文件,除了权重文件。" + +#: ../../developer_guide/contribution/testing.md:251 +#, python-brace-format +msgid "" +"Set `num_hidden_layers` to the expected number of layers, e.g., " +"`{\"num_hidden_layers\": 2,}`" +msgstr "将 `num_hidden_layers` 设置为期望的层数,例如 `{\"num_hidden_layers\": 2,}`" + +#: ../../developer_guide/contribution/testing.md:252 +msgid "" +"Copy the following python script as `generate_random_weight.py`. 
Set the " +"relevant parameters `MODEL_LOCAL_PATH`, `DIST_DTYPE` and `DIST_MODEL_PATH` " +"as needed:" +msgstr "" +"将以下 Python 脚本复制为 `generate_random_weight.py`。根据需要设置相关参数 " +"`MODEL_LOCAL_PATH`、`DIST_DTYPE` 和 `DIST_MODEL_PATH`:" + +#: ../../developer_guide/contribution/testing.md:270 +msgid "Run doctest" +msgstr "运行 doctest" + +#: ../../developer_guide/contribution/testing.md:272 +msgid "" +"vllm-ascend provides a `vllm-ascend/tests/e2e/run_doctests.sh` command to " +"run all doctests in the doc files. The doctest is a good way to make sure " +"the docs are up to date and the examples are executable, you can run it " +"locally as follows:" +msgstr "" +"vllm-ascend 提供了一个 `vllm-ascend/tests/e2e/run_doctests.sh` 命令,用于运行文档文件中的所有 " +"doctest。doctest 是确保文档保持最新且示例可执行的好方法,你可以按照以下方式在本地运行它:" + +#: ../../developer_guide/contribution/testing.md:280 +msgid "" +"This will reproduce the same environment as the CI: " +"[vllm_ascend_doctest.yaml](https://github.com/vllm-project/vllm-" +"ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml)." +msgstr "" +"这将复现与 CI 相同的环境:[vllm_ascend_doctest.yaml](https://github.com/vllm-" +"project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po new file mode 100644 index 0000000..180606e --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/accuracy_report/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/accuracy_report/index.md:1 +#: ../../developer_guide/evaluation/accuracy_report/index.md:3 +msgid "Accuracy Report" +msgstr "准确性报告" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po new file mode 100644 index 0000000..4ea94e3 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/index.md:1 +#: ../../developer_guide/evaluation/index.md:3 +msgid "Accuracy" +msgstr "准确性" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po new file mode 100644 index 0000000..60ecb04 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_evalscope.po @@ -0,0 +1,112 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/using_evalscope.md:1 +msgid "Using EvalScope" +msgstr "使用 EvalScope" + +#: ../../developer_guide/evaluation/using_evalscope.md:3 +msgid "" +"This document will guide you have model inference stress testing and " +"accuracy testing using [EvalScope](https://github.com/modelscope/evalscope)." 
+msgstr "" +"本文档将指导您如何使用 [EvalScope](https://github.com/modelscope/evalscope) " +"进行模型推理压力测试和精度测试。" + +#: ../../developer_guide/evaluation/using_evalscope.md:5 +msgid "1. Online serving" +msgstr "1. 在线服务" + +#: ../../developer_guide/evaluation/using_evalscope.md:7 +msgid "You can run docker container to start the vLLM server on a single NPU:" +msgstr "你可以运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:" + +#: ../../developer_guide/evaluation/using_evalscope.md:34 +msgid "If your service start successfully, you can see the info shown below:" +msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#: ../../developer_guide/evaluation/using_evalscope.md:42 +msgid "" +"Once your server is started, you can query the model with input prompts in " +"new terminal:" +msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:" + +#: ../../developer_guide/evaluation/using_evalscope.md:55 +msgid "2. Install EvalScope using pip" +msgstr "2. 使用 pip 安装 EvalScope" + +#: ../../developer_guide/evaluation/using_evalscope.md:57 +msgid "You can install EvalScope by using:" +msgstr "你可以使用以下方式安装 EvalScope:" + +#: ../../developer_guide/evaluation/using_evalscope.md:65 +msgid "3. Run gsm8k accuracy test using EvalScope" +msgstr "3. 使用 EvalScope 运行 gsm8k 准确率测试" + +#: ../../developer_guide/evaluation/using_evalscope.md:67 +msgid "You can `evalscope eval` run gsm8k accuracy test:" +msgstr "你可以使用 `evalscope eval` 运行 gsm8k 准确率测试:" + +#: ../../developer_guide/evaluation/using_evalscope.md:78 +#: ../../developer_guide/evaluation/using_evalscope.md:114 +msgid "After 1-2 mins, the output is as shown below:" +msgstr "1-2 分钟后,输出如下所示:" + +#: ../../developer_guide/evaluation/using_evalscope.md:88 +msgid "" +"See more detail in: [EvalScope doc - Model API Service " +"Evaluation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-" +"api-service-evaluation)." 
+msgstr "" +"更多详情请见:[EvalScope 文档 - 模型 API " +"服务评测](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-" +"api-service-evaluation)。" + +#: ../../developer_guide/evaluation/using_evalscope.md:90 +msgid "4. Run model inference stress testing using EvalScope" +msgstr "4. 使用 EvalScope 运行模型推理压力测试" + +#: ../../developer_guide/evaluation/using_evalscope.md:92 +msgid "Install EvalScope[perf] using pip" +msgstr "使用 pip 安装 EvalScope[perf]" + +#: ../../developer_guide/evaluation/using_evalscope.md:98 +msgid "Basic usage" +msgstr "基本用法" + +#: ../../developer_guide/evaluation/using_evalscope.md:100 +msgid "You can use `evalscope perf` run perf test:" +msgstr "你可以使用 `evalscope perf` 运行性能测试:" + +#: ../../developer_guide/evaluation/using_evalscope.md:112 +msgid "Output results" +msgstr "输出结果" + +#: ../../developer_guide/evaluation/using_evalscope.md:173 +msgid "" +"See more detail in: [EvalScope doc - Model Inference Stress " +"Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-" +"usage)." +msgstr "" +"更多详情见:[EvalScope 文档 - " +"模型推理压力测试](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-" +"usage)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po new file mode 100644 index 0000000..69c52cb --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_lm_eval.po @@ -0,0 +1,65 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/using_lm_eval.md:1 +msgid "Using lm-eval" +msgstr "使用 lm-eval" + +#: ../../developer_guide/evaluation/using_lm_eval.md:2 +msgid "" +"This document will guide you have a accuracy testing using [lm-" +"eval](https://github.com/EleutherAI/lm-evaluation-harness)." +msgstr "" +"本文将指导你如何使用 [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) " +"进行准确率测试。" + +#: ../../developer_guide/evaluation/using_lm_eval.md:4 +msgid "1. Run docker container" +msgstr "1. 运行 docker 容器" + +#: ../../developer_guide/evaluation/using_lm_eval.md:6 +msgid "You can run docker container on a single NPU:" +msgstr "你可以在单个NPU上运行docker容器:" + +#: ../../developer_guide/evaluation/using_lm_eval.md:33 +msgid "2. Run ceval accuracy test using lm-eval" +msgstr "2. 使用 lm-eval 运行 ceval 准确性测试" + +#: ../../developer_guide/evaluation/using_lm_eval.md:34 +msgid "Install lm-eval in the container." +msgstr "在容器中安装 lm-eval。" + +#: ../../developer_guide/evaluation/using_lm_eval.md:39 +msgid "Run the following command:" +msgstr "运行以下命令:" + +#: ../../developer_guide/evaluation/using_lm_eval.md:50 +msgid "After 1-2 mins, the output is as shown below:" +msgstr "1-2 分钟后,输出如下所示:" + +#: ../../developer_guide/evaluation/using_lm_eval.md:62 +msgid "" +"You can see more usage on [Lm-eval Docs](https://github.com/EleutherAI/lm-" +"evaluation-harness/blob/main/docs/README.md)." 
+msgstr "" +"你可以在 [Lm-eval 文档](https://github.com/EleutherAI/lm-evaluation-" +"harness/blob/main/docs/README.md) 上查看更多用法。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po new file mode 100644 index 0000000..41c00d6 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/evaluation/using_opencompass.po @@ -0,0 +1,83 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/evaluation/using_opencompass.md:1 +msgid "Using OpenCompass" +msgstr "使用 OpenCompass" + +#: ../../developer_guide/evaluation/using_opencompass.md:2 +msgid "" +"This document will guide you have a accuracy testing using " +"[OpenCompass](https://github.com/open-compass/opencompass)." +msgstr "" +"本文档将指导你如何使用 [OpenCompass](https://github.com/open-compass/opencompass) " +"进行准确率测试。" + +#: ../../developer_guide/evaluation/using_opencompass.md:4 +msgid "1. Online Serving" +msgstr "1. 
在线服务" + +#: ../../developer_guide/evaluation/using_opencompass.md:6 +msgid "You can run docker container to start the vLLM server on a single NPU:" +msgstr "你可以运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:" + +#: ../../developer_guide/evaluation/using_opencompass.md:32 +msgid "If your service start successfully, you can see the info shown below:" +msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#: ../../developer_guide/evaluation/using_opencompass.md:39 +msgid "" +"Once your server is started, you can query the model with input prompts in " +"new terminal:" +msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:" + +#: ../../developer_guide/evaluation/using_opencompass.md:51 +msgid "2. Run ceval accuracy test using OpenCompass" +msgstr "2. 使用 OpenCompass 运行 ceval 准确率测试" + +#: ../../developer_guide/evaluation/using_opencompass.md:52 +msgid "" +"Install OpenCompass and configure the environment variables in the " +"container." +msgstr "在容器中安装 OpenCompass 并配置环境变量。" + +#: ../../developer_guide/evaluation/using_opencompass.md:64 +msgid "" +"Add `opencompass/configs/eval_vllm_ascend_demo.py` with the following " +"content:" +msgstr "添加 `opencompass/configs/eval_vllm_ascend_demo.py`,内容如下:" + +#: ../../developer_guide/evaluation/using_opencompass.md:104 +msgid "Run the following command:" +msgstr "运行以下命令:" + +#: ../../developer_guide/evaluation/using_opencompass.md:110 +msgid "After 1-2 mins, the output is as shown below:" +msgstr "1-2 分钟后,输出如下所示:" + +#: ../../developer_guide/evaluation/using_opencompass.md:120 +msgid "" +"You can see more usage on [OpenCompass " +"Docs](https://opencompass.readthedocs.io/en/latest/index.html)." 
+msgstr "" +"你可以在 [OpenCompass " +"文档](https://opencompass.readthedocs.io/en/latest/index.html) 查看更多用法。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po new file mode 100644 index 0000000..57525a3 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/index.po @@ -0,0 +1,33 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/feature_guide/index.md:1 +#: ../../developer_guide/feature_guide/index.md:5 +msgid "Feature Guide" +msgstr "功能指南" + +#: ../../developer_guide/feature_guide/index.md:3 +msgid "" +"This section provides an overview of the features implemented in vLLM " +"Ascend. Developers can refer to this guide to understand how vLLM Ascend " +"works." +msgstr "本节概述了 vLLM Ascend 中实现的功能。开发者可以参考本指南以了解 vLLM Ascend 的工作原理。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po new file mode 100644 index 0000000..1e7daa4 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/feature_guide/patch.po @@ -0,0 +1,248 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/feature_guide/patch.md:1 +msgid "Patch in vLLM Ascend" +msgstr "在 vLLM Ascend 中的补丁" + +#: ../../developer_guide/feature_guide/patch.md:3 +msgid "" +"vLLM Ascend is a platform plugin for vLLM. Due to the release cycle of vLLM " +"and vLLM Ascend is different, and the hardware limitation in some case, we " +"need to patch some code in vLLM to make it compatible with vLLM Ascend." +msgstr "" +"vLLM Ascend 是 vLLM 的一个平台插件。由于 vLLM 和 vLLM Ascend " +"的发布周期不同,并且在某些情况下存在硬件限制,我们需要对 vLLM 进行一些代码补丁,以使其能够兼容 vLLM Ascend。" + +#: ../../developer_guide/feature_guide/patch.md:5 +msgid "" +"In vLLM Ascend code, we provide a patch module `vllm_ascend/patch` to " +"address the change for vLLM." +msgstr "在 vLLM Ascend 代码中,我们提供了一个补丁模块 `vllm_ascend/patch` 用于应对 vLLM 的变更。" + +#: ../../developer_guide/feature_guide/patch.md:7 +msgid "Principle" +msgstr "原理" + +#: ../../developer_guide/feature_guide/patch.md:9 +msgid "" +"We should keep in mind that Patch is not the best way to make vLLM Ascend " +"compatible. It's just a temporary solution. The best way is to contribute " +"the change to vLLM to make it compatible with vLLM Ascend originally. In " +"vLLM Ascend, we have the basic principle for Patch strategy:" +msgstr "" +"我们需要记住,Patch 不是让 vLLM 兼容 Ascend 的最佳方式,这只是一个临时的解决方案。最好的方法是将修改贡献到 vLLM 项目中,从而让" +" vLLM 原生支持 Ascend。对于 vLLM Ascend,我们对 Patch 策略有一个基本原则:" + +#: ../../developer_guide/feature_guide/patch.md:11 +msgid "Less is more. Please do not patch unless it's the only way currently." 
+msgstr "少即是多。请不要打补丁,除非这是目前唯一的方法。" + +#: ../../developer_guide/feature_guide/patch.md:12 +msgid "" +"Once a patch is added, it's required to describe the future plan for " +"removing the patch." +msgstr "一旦补丁被添加,必须说明将来移除该补丁的计划。" + +#: ../../developer_guide/feature_guide/patch.md:13 +msgid "Anytime, clean the patch code is welcome." +msgstr "任何时候,欢迎清理补丁代码。" + +#: ../../developer_guide/feature_guide/patch.md:15 +msgid "How it works" +msgstr "工作原理" + +#: ../../developer_guide/feature_guide/patch.md:17 +msgid "In `vllm_ascend/patch`, you can see the code structure as follows:" +msgstr "在 `vllm_ascend/patch` 目录中,你可以看到如下代码结构:" + +#: ../../developer_guide/feature_guide/patch.md:33 +msgid "" +"**platform**: The patch code in this directory is for patching the code in " +"vLLM main process. It's called by " +"`vllm_ascend/platform::NPUPlatform::pre_register_and_update` very early when" +" vLLM is initialized." +msgstr "" +"**platform**:此目录下的补丁代码用于修补 vLLM 主进程中的代码。当 vLLM 初始化时,会在很早的阶段由 " +"`vllm_ascend/platform::NPUPlatform::pre_register_and_update` 调用。" + +#: ../../developer_guide/feature_guide/patch.md:34 +msgid "" +"For online mode, vLLM process calls the platform patch here " +"`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing " +"the cli args." +msgstr "" +"对于在线模式,vLLM 进程在解析命令行参数时,会在 " +"`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` 这里调用平台补丁。" + +#: ../../developer_guide/feature_guide/patch.md:35 +msgid "" +"For offline mode, vLLM process calls the platform patch here " +"`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when " +"parsing the input parameters." +msgstr "" +"对于离线模式,vLLM 进程在解析输入参数时,会在此处调用平台补丁 " +"`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config`。" + +#: ../../developer_guide/feature_guide/patch.md:36 +msgid "" +"**worker**: The patch code in this directory is for patching the code in " +"vLLM worker process. 
It's called by " +"`vllm_ascend/worker/worker_v1::NPUWorker::__init__` when the vLLM worker " +"process is initialized." +msgstr "" +"**worker**:此目录中的补丁代码用于修补 vLLM worker 进程中的代码。在初始化 vLLM worker 进程时,会被 " +"`vllm_ascend/worker/worker_v1::NPUWorker::__init__` 调用。" + +#: ../../developer_guide/feature_guide/patch.md:37 +msgid "" +"For both online and offline mode, vLLM engine core process calls the worker " +"patch here `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` " +"when initializing the worker process." +msgstr "" +"无论是在线还是离线模式,vLLM 引擎核心进程在初始化 worker 进程时,都会在这里调用 worker " +"补丁:`vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker`。" + +#: ../../developer_guide/feature_guide/patch.md:39 +msgid "" +"In both **platform** and **worker** folder, there are several patch modules." +" They are used for patching different version of vLLM." +msgstr "在 **platform** 和 **worker** 文件夹中都有一些补丁模块。它们用于修补不同版本的 vLLM。" + +#: ../../developer_guide/feature_guide/patch.md:41 +msgid "" +"`patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is " +"always the nearest version of vLLM. Once vLLM is released, we will drop this" +" patch module and bump to a new version. For example, `patch_0_9_2` is used " +"for patching vLLM 0.9.2." +msgstr "" +"`patch_0_9_2`:此模块用于修补 vLLM 0.9.2。该版本始终对应于 vLLM 的最近版本。一旦 vLLM " +"发布新版本,我们将移除此补丁模块并升级到新版本。例如,`patch_0_9_2` 就是用于修补 vLLM 0.9.2 的。" + +#: ../../developer_guide/feature_guide/patch.md:42 +msgid "" +"`patch_main`: This module is used for patching the code in vLLM main branch." +msgstr "`patch_main`:该模块用于修补 vLLM 主分支代码。" + +#: ../../developer_guide/feature_guide/patch.md:43 +msgid "" +"`patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM " +"main branch." 
+msgstr "`patch_common`:此模块用于同时修补 vLLM 0.9.2 版本和 vLLM 主分支。" + +#: ../../developer_guide/feature_guide/patch.md:45 +msgid "How to write a patch" +msgstr "如何撰写补丁" + +#: ../../developer_guide/feature_guide/patch.md:47 +msgid "" +"Before writing a patch, following the principle above, we should patch the " +"least code. If it's necessary, we can patch the code in either **platform** " +"and **worker** folder. Here is an example to patch `distributed` module in " +"vLLM." +msgstr "" +"在编写补丁之前,遵循上述原则,我们应尽量修改最少的代码。如果有必要,我们可以修改 **platform** 和 **worker** " +"文件夹中的代码。下面是一个在 vLLM 中修改 `distributed` 模块的示例。" + +#: ../../developer_guide/feature_guide/patch.md:49 +msgid "" +"Decide which version of vLLM we should patch. For example, after analysis, " +"here we want to patch both 0.9.2 and main of vLLM." +msgstr "决定我们应该修补哪个版本的 vLLM。例如,经过分析后,这里我们想要同时修补 vLLM 的 0.9.2 版和主分支(main)。" + +#: ../../developer_guide/feature_guide/patch.md:50 +msgid "" +"Decide which process we should patch. For example, here `distributed` " +"belongs to the vLLM main process, so we should patch `platform`." +msgstr "决定我们应该修补哪个进程。例如,这里 `distributed` 属于 vLLM 主进程,所以我们应该修补 `platform`。" + +#: ../../developer_guide/feature_guide/patch.md:51 +#, python-brace-format +msgid "" +"Create the patch file in the right folder. The file should be named as " +"`patch_{module_name}.py`. The example here is " +"`vllm_ascend/patch/platform/patch_common/patch_distributed.py`." +msgstr "" +"在正确的文件夹中创建补丁文件。文件应命名为 `patch_{module_name}.py`。此处的示例是 " +"`vllm_ascend/patch/platform/patch_common/patch_distributed.py`。" + +#: ../../developer_guide/feature_guide/patch.md:52 +msgid "Write your patch code in the new file. Here is an example:" +msgstr "在新文件中编写你的补丁代码。以下是一个示例:" + +#: ../../developer_guide/feature_guide/patch.md:62 +msgid "" +"Import the patch file in `__init__.py`. In this example, add `import " +"vllm_ascend.patch.platform.patch_common.patch_distributed` into " +"`vllm_ascend/patch/platform/patch_common/__init__.py`." 
+msgstr "" +"在 `__init__.py` 中导入补丁文件。在这个示例中,将 `import " +"vllm_ascend.patch.platform.patch_common.patch_distributed` 添加到 " +"`vllm_ascend/patch/platform/patch_common/__init__.py` 中。" + +#: ../../developer_guide/feature_guide/patch.md:63 +msgid "" +"Add the description of the patch in `vllm_ascend/patch/__init__.py`. The " +"description format is as follows:" +msgstr "在 `vllm_ascend/patch/__init__.py` 中添加补丁的描述。描述格式如下:" + +#: ../../developer_guide/feature_guide/patch.md:77 +msgid "" +"Add the Unit Test and E2E Test. Any newly added code in vLLM Ascend should " +"contain the Unit Test and E2E Test as well. You can find more details in " +"[test guide](../contribution/testing.md)" +msgstr "" +"添加单元测试和端到端(E2E)测试。在 vLLM Ascend 中新增的任何代码也应包含单元测试和端到端测试。更多详情请参见 " +"[测试指南](../contribution/testing.md)。" + +#: ../../developer_guide/feature_guide/patch.md:80 +msgid "Limitation" +msgstr "限制" + +#: ../../developer_guide/feature_guide/patch.md:81 +msgid "" +"In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore " +"process and Worker process. Now vLLM Ascend only support patch the code in " +"Main process and Worker process by default. If you want to patch the code " +"runs in EngineCore process, you should patch EngineCore process entirely " +"during setup, the entry code is here `vllm.v1.engine.core`. Please override " +"`EngineCoreProc` and `DPEngineCoreProc` entirely." +msgstr "" +"在 V1 引擎中,vLLM 会启动三种类型的进程:主进程、EngineCore 进程和 Worker 进程。现在 vLLM Ascend " +"默认只支持在主进程和 Worker 进程中打补丁代码。如果你想要在 EngineCore 进程中打补丁,你需要在设置阶段对 EngineCore " +"进程整体打补丁,入口代码在 `vllm.v1.engine.core`。请完全重写 `EngineCoreProc` 和 " +"`DPEngineCoreProc`。" + +#: ../../developer_guide/feature_guide/patch.md:82 +msgid "" +"If you are running an edited vLLM code, the version of the vLLM may be " +"changed automatically. 
For example, if you runs an edited vLLM based on " +"v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the " +"patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM " +"Ascend can't distinguish the version of vLLM you're using. In this case, you" +" can set the environment variable `VLLM_VERSION` to specify the version of " +"vLLM you're using, then the patch for v0.9.2 should work." +msgstr "" +"如果你运行的是经过编辑的 vLLM 代码,vLLM 的版本可能会被自动更改。例如,如果你基于 v0.9.n 运行了编辑后的 vLLM,vLLM " +"的版本可能会变为 v0.9.nxxx,在这种情况下,vLLM Ascend 的 v0.9.n 补丁将无法正常工作,因为 vLLM Ascend " +"无法区分你所使用的 vLLM 版本。这时,你可以设置环境变量 `VLLM_VERSION` 来指定你所使用的 vLLM 版本,这样对 v0.9.2 " +"的补丁就应该可以正常工作。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po new file mode 100644 index 0000000..09f8f9e --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_model.po @@ -0,0 +1,333 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/adding_a_new_model.md:1 +msgid "Adding a New Model" +msgstr "添加新模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:3 +msgid "" +"This guide demonstrates how to integrate a novel or customized model into " +"vllm-ascend. 
For foundational concepts, it is highly recommended to refer to" +" [vllm official doc: Adding a New " +"Model](https://docs.vllm.ai/en/stable/contributing/model/) first." +msgstr "" +"本指南演示如何将新颖或自定义的模型集成到 vllm-ascend 中。对于基础概念,强烈建议先参考 [vllm " +"官方文档:添加新模型](https://docs.vllm.ai/en/stable/contributing/model/)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:6 +msgid "Step 1: Implementing Models with `torch` and `torch_npu`" +msgstr "步骤 1:使用 `torch` 和 `torch_npu` 实现模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:8 +msgid "" +"This section provides instructions for implementing new models compatible " +"with vllm and vllm-ascend." +msgstr "本节提供了实现与 vllm 和 vllm-ascend 兼容的新模型的相关说明。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:10 +msgid "**Before starting:**" +msgstr "**开始之前:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:12 +msgid "" +"Verify whether your model already exists in vllm's " +"[models](https://github.com/vllm-" +"project/vllm/tree/main/vllm/model_executor/models) directory." +msgstr "" +"请确认你的模型是否已经存在于 vllm 的 [models](https://github.com/vllm-" +"project/vllm/tree/main/vllm/model_executor/models) 目录中。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:13 +msgid "" +"Use existing models' implementation as templates to accelerate your " +"development." +msgstr "使用已有模型的实现作为模板以加速您的开发。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:15 +msgid "Method 1: Implementing New Models from Scratch" +msgstr "方法一:从零开始实现新模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:17 +msgid "" +"Follow vllm's [OPT model " +"adaptation](https://docs.vllm.ai/en/stable/contributing/model/basic.html) " +"example for guidance." 
+msgstr "" +"请参考 vllm 的 [OPT " +"模型适配](https://docs.vllm.ai/en/stable/contributing/model/basic.html) 示例进行操作。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:19 +msgid "**Key implementation requirements:**" +msgstr "**关键实现要求:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:21 +msgid "Place model files in `vllm_ascend/models/` directory." +msgstr "请将模型文件放在 `vllm_ascend/models/` 目录下。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:23 +msgid "" +"Standard module structure for decoder-only LLMs (please checkout vllm's " +"implementations for other kinds of model):" +msgstr "解码器-only LLMs 的标准模块结构(请参考 vllm 对其他类型模型的实现):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:25 +msgid "`*ModelForCausalLM` (top-level wrapper)" +msgstr "`*ModelForCausalLM`(顶层包装器)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:26 +msgid "`*Model` (main architecture)" +msgstr "`*Model`(主架构)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:27 +msgid "`*DecoderLayer` (transformer block)" +msgstr "`*DecoderLayer` (transformer 块)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:28 +msgid "`*Attention` and `*MLP` (specific computation unit)" +msgstr "`*Attention` 和 `*MLP`(特定计算单元)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:31 +msgid "`*` denotes your model's unique identifier." +msgstr "`*` 表示你的模型的唯一标识符。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:34 +msgid "Critical Implementation Details:" +msgstr "关键实现细节:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:36 +msgid "All modules must include a `prefix` argument in `__init__()`." 
+msgstr "所有模块在 `__init__()` 方法中都必须包含一个 `prefix` 参数。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:38 +msgid "**Required interfaces:**" +msgstr "**必需的接口:**" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "Module Type" +msgstr "模块类型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "Required Methods" +msgstr "必需的方法" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`*ModelForCausalLM`" +msgstr "`*ModelForCausalLM`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`get_input_embeddings`, `compute_logits`, `load_weights`" +msgstr "`get_input_embeddings`,`compute_logits`,`load_weights`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`*Model`" +msgstr "`*模型`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:30 +msgid "`get_input_embeddings`, `load_weights`" +msgstr "`get_input_embeddings`,`load_weights`" + +#: ../../developer_guide/modeling/adding_a_new_model.md:45 +msgid "Attention Backend Integration:" +msgstr "注意后端集成:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:47 +msgid "" +"Importing attention via `from vllm.attention import Attention` can " +"automatically leverage the attention backend routing of vllm-ascend (see: " +"`get_attn_backend_cls()` in `vllm_ascend/platform.py`)." +msgstr "" +"通过 `from vllm.attention import Attention` 导入 attention 可以自动利用 vllm-ascend " +"的注意力后端路由(详见:`vllm_ascend/platform.py` 中的 `get_attn_backend_cls()`)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:49 +msgid "Tensor Parallelism:" +msgstr "张量并行:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:51 +msgid "" +"Use vllm's parallel layers (`ColumnParallelLinear`, " +"`VocabParallelEmbedding`, etc.) to implement models supporting tensor " +"parallelism. Note that Ascend-specific customizations are implemented in " +"`vllm_ascend/ops/` directory (RMSNorm, VocabParallelEmbedding, etc.)." 
+msgstr "" +"使用 vllm 的并行层(如 `ColumnParallelLinear`、`VocabParallelEmbedding` " +"等)来实现支持张量并行的模型。需要注意的是,Ascend 特有的自定义实现(如 RMSNorm、VocabParallelEmbedding 等)位于 " +"`vllm_ascend/ops/` 目录下。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:53 +msgid "" +"**Reference Implementation Template** (assumed path: " +"`vllm_ascend/models/custom_model.py`):" +msgstr "**参考实现模板**(假定路径:`vllm_ascend/models/custom_model.py`):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:135 +msgid "Method 2: Customizing Existing vLLM Models" +msgstr "方法二:自定义已有的 vLLM 模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:137 +msgid "" +"For most use cases, extending existing implementations is preferable. We " +"demonstrate an example to inherit from base classes and implement a custom " +"deepseek model below (assumed path: `vllm_ascend/models/deepseek_v2.py`)." +msgstr "" +"对于大多数使用场景,建议扩展已有的实现。我们在下面演示了一个示例,通过继承基类并实现一个自定义的 deepseek " +"模型(假定路径:`vllm_ascend/models/deepseek_v2.py`)。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:175 +msgid "" +"For a complete implementation reference, see: " +"`vllm_ascend/models/deepseek_v2.py`." +msgstr "完整的实现参考请见:`vllm_ascend/models/deepseek_v2.py`。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:178 +msgid "Step 2: Registering Custom Models using ModelRegistry Plugins in vLLM" +msgstr "第2步:使用 vLLM 中的 ModelRegistry 插件注册自定义模型" + +#: ../../developer_guide/modeling/adding_a_new_model.md:180 +msgid "" +"vllm provides a plugin mechanism for registering externally implemented " +"models without modifying its codebase." 
+msgstr "vllm 提供了一种插件机制,可用于注册外部实现的模型,而无需修改其代码库。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:182 +msgid "" +"To integrate your implemented model from `vllm_ascend/models/` directory:" +msgstr "要集成你在 `vllm_ascend/models/` 目录下实现的模型:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:184 +msgid "" +"Import your model implementation in `vllm_ascend/models/__init__.py` using " +"relative imports." +msgstr "使用相对导入在 `vllm_ascend/models/__init__.py` 中导入你的模型实现。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:185 +msgid "" +"Register the model wrapper class via `vllm.ModelRegistry.register_model()` " +"function." +msgstr "通过 `vllm.ModelRegistry.register_model()` 函数注册模型包装类。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:187 +msgid "" +"**Reference Registration Template** (an example of registering new models in" +" `vllm_ascend/models/__init__.py`):" +msgstr "**参考注册模板**(在 `vllm_ascend/models/__init__.py` 注册新模型的示例):" + +#: ../../developer_guide/modeling/adding_a_new_model.md:210 +msgid "" +"The first argument of `vllm.ModelRegistry.register_model()` indicates the " +"unique architecture identifier which must match `architectures` in " +"`config.json` of the model." 
+msgstr "" +"`vllm.ModelRegistry.register_model()` 的第一个参数表示唯一的架构标识符,这个标识符必须与模型的 " +"`config.json` 文件中的 `architectures` 匹配。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:221 +msgid "Step 3: Verification" +msgstr "第 3 步:验证" + +#: ../../developer_guide/modeling/adding_a_new_model.md:223 +msgid "Case 1: Overriding Existing vLLM Model Architecture" +msgstr "案例 1:重载已有的 vLLM 模型架构" + +#: ../../developer_guide/modeling/adding_a_new_model.md:225 +msgid "" +"If you're registering a customized model architecture based on vllm's " +"existing implementation (overriding vllm's original class), when executing " +"vllm offline/online inference (using any model), you'll observe warning logs" +" similar to the following output from " +"`vllm/models_executor/models/registry.py`." +msgstr "" +"如果你基于 vllm 的现有实现注册了一个自定义的模型架构(覆盖了 vllm 的原始类),在执行 vllm " +"的离线/在线推理(无论使用哪个模型)时,你会看到类似于 `vllm/models_executor/models/registry.py` " +"输出的警告日志。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:231 +msgid "Case 2: Registering New Model Architecture" +msgstr "案例2:注册新模型架构" + +#: ../../developer_guide/modeling/adding_a_new_model.md:233 +msgid "" +"If you're registering a novel model architecture not present in vllm " +"(creating a completely new class), current logs won't provide explicit " +"confirmation by default. It's recommended to add the following logging " +"statement at the end of the `register_model` method in " +"`vllm/models_executor/models/registry.py`." +msgstr "" +"如果你注册了 vllm 中不存在的新模型架构(创建一个全新的类),当前日志默认不会提供明确的确认信息。建议在 " +"`vllm/models_executor/models/registry.py` 文件中的 `register_model` " +"方法末尾添加如下日志语句。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:239 +msgid "" +"After adding this line, you will see confirmation logs shown below when " +"running vllm offline/online inference (using any model)." 
+msgstr "添加这一行之后,当你运行 vllm 离线/在线推理(使用任何模型)时,将会看到如下确认日志。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:245 +msgid "" +"This log output confirms your novel model architecture has been successfully" +" registered in vllm." +msgstr "该日志输出确认了你的新模型架构已成功在 vllm 中注册。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:247 +msgid "Step 4: Testing" +msgstr "第4步:测试" + +#: ../../developer_guide/modeling/adding_a_new_model.md:249 +msgid "" +"After adding a new model, we should do basic functional test (offline/online" +" inference), accuracy test and performance benchmark for the model." +msgstr "在添加新模型后,我们应对该模型进行基本功能测试(离线/在线推理)、准确率测试和性能基准测试。" + +#: ../../developer_guide/modeling/adding_a_new_model.md:251 +msgid "Find more details at:" +msgstr "更多详情请见:" + +#: ../../developer_guide/modeling/adding_a_new_model.md:253 +msgid "" +"[Accuracy test guide](https://vllm-" +"ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html)" +msgstr "" +"[精度测试指南](https://vllm-" +"ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:254 +msgid "" +"[Performance benchmark guide](https://vllm-" +"ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)" +msgstr "" +"[性能基准指南](https://vllm-" +"ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)" + +#: ../../developer_guide/modeling/adding_a_new_model.md:256 +msgid "Step 5: Updating Supported Models Doc" +msgstr "第5步:更新支持的模型文档" + +#: ../../developer_guide/modeling/adding_a_new_model.md:258 +msgid "" +"At last, if all the steps above are completed, you should add the new model " +"into our [Supported Models](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/supported_models.html) doc." 
+msgstr "" +"最后,如果以上所有步骤都已完成,你应该将新模型添加到我们的[支持的模型](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/supported_models.html)文档中。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po new file mode 100644 index 0000000..e0fd947 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/adding_a_new_multimodal_model.po @@ -0,0 +1,29 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:1 +msgid "Adding a New Multi-Modal Model" +msgstr "添加新的多模态模型" + +#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3 +msgid "**_Comming soon ..._**" +msgstr "**_敬请期待 ..._**" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po new file mode 100644 index 0000000..29a2982 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/modeling/index.po @@ -0,0 +1,32 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/modeling/index.md:1 +#: ../../developer_guide/modeling/index.md:5 +msgid "Modeling" +msgstr "新模型" + +#: ../../developer_guide/modeling/index.md:3 +msgid "" +"This section provides tutorials of how to implement and register a new model" +" into vllm-ascend." +msgstr "本节提供了如何在 vllm-ascend 中实现并注册新模型的教程。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po new file mode 100644 index 0000000..c2b2e6f --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/index.po @@ -0,0 +1,26 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/performance/index.md:1 +#: ../../developer_guide/performance/index.md:3 +msgid "Performance" +msgstr "性能" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po new file mode 100644 index 0000000..484edac --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/performance_benchmark.po @@ -0,0 +1,88 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../developer_guide/performance/performance_benchmark.md:1 +msgid "Performance Benchmark" +msgstr "性能基准" + +#: ../../developer_guide/performance/performance_benchmark.md:2 +msgid "" +"This document details the benchmark methodology for vllm-ascend, aimed at " +"evaluating the performance under a variety of workloads. 
To maintain " +"alignment with vLLM, we use the [benchmark](https://github.com/vllm-" +"project/vllm/tree/main/benchmarks) script provided by the vllm project." +msgstr "" +"本文档详细说明了 vllm-ascend 的基准测试方法,旨在评估其在多种工作负载下的性能。为了与 vLLM 保持一致,我们使用 vllm 项目提供的 " +"[benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) 脚本。" + +#: ../../developer_guide/performance/performance_benchmark.md:4 +msgid "" +"**Benchmark Coverage**: We measure offline e2e latency and throughput, and " +"fixed-QPS online serving benchmarks, for more details see [vllm-ascend " +"benchmark scripts](https://github.com/vllm-project/vllm-" +"ascend/tree/main/benchmarks)." +msgstr "" +"**基准测试覆盖范围**:我们测量离线端到端延迟和吞吐量,以及固定 QPS 的在线服务基准测试。更多详情请参见 [vllm-ascend " +"基准测试脚本](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks)。" + +#: ../../developer_guide/performance/performance_benchmark.md:6 +msgid "1. Run docker container" +msgstr "1. 运行 docker 容器" + +#: ../../developer_guide/performance/performance_benchmark.md:31 +msgid "2. Install dependencies" +msgstr "2. 安装依赖项" + +#: ../../developer_guide/performance/performance_benchmark.md:38 +msgid "3. (Optional)Prepare model weights" +msgstr "3.(可选)准备模型权重" + +#: ../../developer_guide/performance/performance_benchmark.md:39 +msgid "" +"For faster running speed, we recommend downloading the model in advance:" +msgstr "为了更快的运行速度,建议提前下载模型:" + +#: ../../developer_guide/performance/performance_benchmark.md:44 +msgid "" +"You can also replace all model paths in the [json](https://github.com/vllm-" +"project/vllm-ascend/tree/main/benchmarks/tests) files with your local paths:" +msgstr "" +"你也可以将 [json](https://github.com/vllm-project/vllm-" +"ascend/tree/main/benchmarks/tests) 文件中的所有模型路径替换为你的本地路径:" + +#: ../../developer_guide/performance/performance_benchmark.md:60 +msgid "4. Run benchmark script" +msgstr "4. 
运行基准测试脚本"
+
+#: ../../developer_guide/performance/performance_benchmark.md:61
+msgid "Run benchmark script:"
+msgstr "运行基准测试脚本:"
+
+#: ../../developer_guide/performance/performance_benchmark.md:66
+msgid "After about 10 mins, the output is as shown below:"
+msgstr "大约 10 分钟后,输出如下所示:"
+
+#: ../../developer_guide/performance/performance_benchmark.md:176
+msgid ""
+"The result json files are generated into the path `benchmark/results` These "
+"files contain detailed benchmarking results for further analysis."
+msgstr "结果 json 文件会生成到路径 `benchmark/results`。这些文件包含了用于进一步分析的详细基准测试结果。"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po
new file mode 100644
index 0000000..7c83ca9
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/developer_guide/performance/profile_execute_duration.po
@@ -0,0 +1,81 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2025, vllm-ascend team
+# This file is distributed under the same license as the vllm-ascend
+# package.
+# FIRST AUTHOR , 2025.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: vllm-ascend\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2025-07-18 09:01+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME \n"
+"Language-Team: zh_CN \n"
+"Language: zh_CN\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"Generated-By: Babel 2.17.0\n"
+
+#: ../../developer_guide/performance/profile_execute_duration.md:1
+msgid "Profile Execute Duration"
+msgstr "执行时长分析"
+
+#: ../../developer_guide/performance/profile_execute_duration.md:3
+msgid ""
+"The execution duration of each stage (including pre/post-processing, model "
+"forward, etc.) usually needs to be captured during a complete inference "
+"process. 
Typically, this is done by using `torch.npu.synchronize()` and " +"obtaining CPU timestamps, which increases the performance overhead of " +"host/device synchronization." +msgstr "" +"在完整的推理过程中,通常需要记录每个阶段(包括前/后处理、模型前向等)的执行时长。一般通过使用 `torch.npu.synchronize()` " +"并获取 CPU 时间戳来实现,这会增加主机/设备同步的性能开销。" + +#: ../../developer_guide/performance/profile_execute_duration.md:5 +msgid "" +"**To reduce the performance overhead, we add this feature, using the NPU " +"event timestamp mechanism to observe the device execution time " +"asynchronously.**" +msgstr "**为了减少性能开销,我们添加了此功能,使用 NPU 事件时间戳机制异步观测设备的执行时间。**" + +#: ../../developer_guide/performance/profile_execute_duration.md:7 +msgid "Usage" +msgstr "用法" + +#: ../../developer_guide/performance/profile_execute_duration.md:8 +msgid "" +"Use the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to " +"enable this feature." +msgstr "使用环境变量 `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` 来启用此功能。" + +#: ../../developer_guide/performance/profile_execute_duration.md:9 +msgid "" +"Use the non-blocking API `ProfileExecuteDuration().capture_async` to set " +"observation points asynchronously when you need to observe the execution " +"duration." +msgstr "" +"当你需要观察执行时长时,可以使用非阻塞 API `ProfileExecuteDuration().capture_async` 异步设置观察点。" + +#: ../../developer_guide/performance/profile_execute_duration.md:10 +msgid "" +"Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an " +"appropriate time to get and print the execution durations of all observed " +"stages." +msgstr "" +"在适当的时机使用阻塞式 API `ProfileExecuteDuration().pop_captured_sync` " +"获取并打印所有已观察到阶段的执行时长。" + +#: ../../developer_guide/performance/profile_execute_duration.md:12 +msgid "" +"**We have instrumented the key inference stages (including pre-processing, " +"model forward pass, etc.) for execute duration profiling. 
Execute the script"
+" as follows:**"
+msgstr "**我们已经对关键的推理阶段(包括预处理、模型前向传递等)进行了执行时长分析的检测。请按如下方式执行脚本:**"
+
+#: ../../developer_guide/performance/profile_execute_duration.md:17
+msgid "Example Output"
+msgstr "示例输出"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po b/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po
new file mode 100644
index 0000000..f961f09
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/faqs.po
@@ -0,0 +1,479 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2025, vllm-ascend team
+# This file is distributed under the same license as the vllm-ascend
+# package.
+# FIRST AUTHOR , 2025.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: vllm-ascend\n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2025-07-18 09:01+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME \n"
+"Language-Team: zh_CN \n"
+"Language: zh_CN\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=1; plural=0;\n"
+"Generated-By: Babel 2.17.0\n"
+
+#: ../../faqs.md:1
+msgid "FAQs"
+msgstr "常见问题"
+
+#: ../../faqs.md:3
+msgid "Version Specific FAQs"
+msgstr "特定版本常见问题"
+
+#: ../../faqs.md:5
+msgid ""
+"[[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-"
+"ascend/issues/1007)"
+msgstr ""
+"[[v0.7.3.post1] 常见问题与反馈](https://github.com/vllm-project/vllm-"
+"ascend/issues/1007)"
+
+#: ../../faqs.md:6
+msgid ""
+"[[v0.9.2rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-"
+"ascend/issues/1742)"
+msgstr ""
+"[[v0.9.2rc1] 常见问题与反馈](https://github.com/vllm-project/vllm-"
+"ascend/issues/1742)"
+
+#: ../../faqs.md:8
+msgid "General FAQs"
+msgstr "常见问题解答"
+
+#: ../../faqs.md:10
+msgid "1. What devices are currently supported?"
+msgstr "1. 目前支持哪些设备?"
+ +#: ../../faqs.md:12 +msgid "" +"Currently, **ONLY** Atlas A2 series(Ascend-cann-kernels-910b) and Atlas " +"300I(Ascend-cann-kernels-310p) series are supported:" +msgstr "" +"目前,**仅**支持 Atlas A2 系列(Ascend-cann-kernels-910b)和 Atlas 300I(Ascend-cann-" +"kernels-310p)系列:" + +#: ../../faqs.md:14 +msgid "" +"Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 " +"Box16, Atlas 300T A2)" +msgstr "" +"Atlas A2 训练系列(Atlas 800T A2,Atlas 900 A2 PoD,Atlas 200T A2 Box16,Atlas 300T " +"A2)" + +#: ../../faqs.md:15 +msgid "Atlas 800I A2 Inference series (Atlas 800I A2)" +msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)" + +#: ../../faqs.md:16 +msgid "Atlas 300I Inference series (Atlas 300I Duo)" +msgstr "Atlas 300I 推理系列(Atlas 300I Duo)" + +#: ../../faqs.md:18 +msgid "Below series are NOT supported yet:" +msgstr "以下系列目前尚不受支持:" + +#: ../../faqs.md:19 +msgid "Atlas 200I A2 (Ascend-cann-kernels-310b) unplanned yet" +msgstr "Atlas 200I A2(Ascend-cann-kernels-310b)尚未计划" + +#: ../../faqs.md:20 +msgid "Ascend 910, Ascend 910 Pro B (Ascend-cann-kernels-910) unplanned yet" +msgstr "Ascend 910,Ascend 910 Pro B(Ascend-cann-kernels-910)尚未计划" + +#: ../../faqs.md:22 +msgid "" +"From a technical view, vllm-ascend support would be possible if the torch-" +"npu is supported. Otherwise, we have to implement it by using custom ops. We" +" are also welcome to join us to improve together." +msgstr "" +"从技术角度来看,如果支持 torch-npu,则可以支持 vllm-ascend。否则,我们需要通过自定义算子来实现。我们也欢迎大家一起加入,共同改进。" + +#: ../../faqs.md:24 +msgid "2. How to get our docker containers?" +msgstr "2. 如何获取我们的 docker 容器?" + +#: ../../faqs.md:26 +msgid "" +"You can get our containers at `Quay.io`, e.g., [vllm-" +"ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and " +"[cann](https://quay.io/repository/ascend/cann?tab=tags)." 
+msgstr "" +"你可以在 `Quay.io` 获取我们的容器,例如,[vllm-" +"ascend](https://quay.io/repository/ascend/vllm-ascend?tab=tags) 和 " +"[cann](https://quay.io/repository/ascend/cann?tab=tags)。" + +#: ../../faqs.md:28 +msgid "" +"If you are in China, you can use `daocloud` to accelerate your downloading:" +msgstr "如果你在中国,可以使用 `daocloud` 来加速下载:" + +#: ../../faqs.md:36 +msgid "3. What models does vllm-ascend supports?" +msgstr "3. vllm-ascend 支持哪些模型?" + +#: ../../faqs.md:38 +msgid "" +"Find more details [here](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)." +msgstr "" +"在[此处](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)查看更多详细信息。" + +#: ../../faqs.md:40 +msgid "4. How to get in touch with our community?" +msgstr "4. 如何与我们的社区取得联系?" + +#: ../../faqs.md:42 +msgid "" +"There are many channels that you can communicate with our community " +"developers / users:" +msgstr "你可以通过多种渠道与我们的社区开发者/用户进行交流:" + +#: ../../faqs.md:44 +msgid "" +"Submit a GitHub [issue](https://github.com/vllm-project/vllm-" +"ascend/issues?page=1)." +msgstr "" +"提交一个 GitHub [issue](https://github.com/vllm-project/vllm-" +"ascend/issues?page=1)。" + +#: ../../faqs.md:45 +msgid "" +"Join our [weekly " +"meeting](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z)" +" and share your ideas." +msgstr "" +"加入我们的[每周会议](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z),并分享你的想法。" + +#: ../../faqs.md:46 +msgid "" +"Join our [WeChat](https://github.com/vllm-project/vllm-" +"ascend/issues/227) group and ask your quenstions." +msgstr "" +"加入我们的 [微信群](https://github.com/vllm-project/vllm-ascend/issues/227) " +"并提问你的问题。" + +#: ../../faqs.md:47 +msgid "" +"Join our ascend channel in [vLLM " +"forums](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-" +"support/6) and publish your topics." 
+msgstr "" +"加入我们在 [vLLM 论坛](https://discuss.vllm.ai/c/hardware-support/vllm-" +"ascend-support/6) 的 ascend 频道并发布你的话题。" + +#: ../../faqs.md:49 +msgid "5. What features does vllm-ascend V1 supports?" +msgstr "5. vllm-ascend V1 支持哪些功能?" + +#: ../../faqs.md:51 +msgid "" +"Find more details [here](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)." +msgstr "" +"在[这里](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)找到更多详细信息。" + +#: ../../faqs.md:53 +msgid "" +"6. How to solve the problem of \"Failed to infer device type\" or " +"\"libatb.so: cannot open shared object file\"?" +msgstr "6. 如何解决“无法推断设备类型”或“libatb.so:无法打开共享对象文件”问题?" + +#: ../../faqs.md:55 +msgid "" +"Basically, the reason is that the NPU environment is not configured " +"correctly. You can:" +msgstr "基本上,原因是 NPU 环境没有正确配置。你可以:" + +#: ../../faqs.md:56 +msgid "" +"try `source /usr/local/Ascend/nnal/atb/set_env.sh` to enable NNAL package." +msgstr "尝试运行 `source /usr/local/Ascend/nnal/atb/set_env.sh` 以启用 NNAL 包。" + +#: ../../faqs.md:57 +msgid "" +"try `source /usr/local/Ascend/ascend-toolkit/set_env.sh` to enable CANN " +"package." +msgstr "尝试运行 `source /usr/local/Ascend/ascend-toolkit/set_env.sh` 以启用 CANN 包。" + +#: ../../faqs.md:58 +msgid "try `npu-smi info` to check whether the NPU is working." +msgstr "尝试运行 `npu-smi info` 来检查 NPU 是否正常工作。" + +#: ../../faqs.md:60 +msgid "" +"If all above steps are not working, you can try the following code with " +"python to check whether there is any error:" +msgstr "如果以上所有步骤都无效,你可以尝试使用以下 python 代码来检查是否有错误:" + +#: ../../faqs.md:68 +msgid "" +"If all above steps are not working, feel free to submit a GitHub issue." +msgstr "如果以上所有步骤都无法解决问题,欢迎提交一个 GitHub issue。" + +#: ../../faqs.md:70 +msgid "7. How does vllm-ascend perform?" +msgstr "7. vllm-ascend 的性能如何?" + +#: ../../faqs.md:72 +msgid "" +"Currently, only some models are improved. 
Such as `Qwen2.5 VL`, `Qwen3`, " +"`Deepseek V3`. Others are not good enough. From 0.9.0rc2, Qwen and Deepseek" +" works with graph mode to play a good performance. What's more, you can " +"install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference " +"as well." +msgstr "" +"目前,只有部分模型得到了改进,比如 `Qwen2.5 VL`、`Qwen3` 和 `Deepseek V3`。其他模型的效果还不够理想。从 " +"0.9.0rc2 开始,Qwen 和 Deepseek 已经支持图模式,以获得更好的性能。此外,你还可以在 `vllm-ascend v0.7.3` " +"上安装 `mindie-turbo`,进一步加速推理。" + +#: ../../faqs.md:74 +msgid "8. How vllm-ascend work with vllm?" +msgstr "8. vllm-ascend 如何与 vllm 协同工作?" + +#: ../../faqs.md:75 +msgid "" +"vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is " +"the same as the version of vllm. For example, if you use vllm 0.7.3, you " +"should use vllm-ascend 0.7.3 as well. For main branch, we will make sure " +"`vllm-ascend` and `vllm` are compatible by each commit." +msgstr "" +"vllm-ascend 是 vllm 的一个插件。基本上,vllm-ascend 的版本与 vllm 的版本是相同的。例如,如果你使用 vllm " +"0.7.3,你也应该使用 vllm-ascend 0.7.3。对于主分支,我们会确保每次提交都让 `vllm-ascend` 和 `vllm` " +"保持兼容。" + +#: ../../faqs.md:77 +msgid "9. Does vllm-ascend support Prefill Disaggregation feature?" +msgstr "9. vllm-ascend 支持 Prefill Disaggregation 功能吗?" + +#: ../../faqs.md:79 +msgid "" +"Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND " +"support, We will make it stable and supported by vllm-ascend in the future." +msgstr "目前,V0引擎只支持1P1D。对于V1引擎或NPND的支持,我们将在未来使其稳定并由vllm-ascend支持。" + +#: ../../faqs.md:81 +msgid "10. Does vllm-ascend support quantization method?" +msgstr "10. vllm-ascend 支持量化方法吗?" + +#: ../../faqs.md:83 +msgid "" +"Currently, w8a8 quantization is already supported by vllm-ascend originally " +"on v0.8.4rc2 or higher, If you're using vllm 0.7.3 version, w8a8 " +"quantization is supporeted with the integration of vllm-ascend and mindie-" +"turbo, please use `pip install vllm-ascend[mindie-turbo]`." 
+msgstr "" +"目前,w8a8 量化已在 v0.8.4rc2 或更高版本的 vllm-ascend 中原生支持。如果你使用的是 vllm 0.7.3 版本,集成了 " +"vllm-ascend 和 mindie-turbo 后也支持 w8a8 量化,请使用 `pip install vllm-ascend[mindie-" +"turbo]`。" + +#: ../../faqs.md:85 +msgid "11. How to run w8a8 DeepSeek model?" +msgstr "11. 如何运行 w8a8 DeepSeek 模型?" + +#: ../../faqs.md:87 +msgid "" +"Please following the [inferencing tutorail](https://vllm-" +"ascend.readthedocs.io/en/latest/tutorials/multi_node.html) and replace model" +" to DeepSeek." +msgstr "" +"请按照[inferencing 教程](https://vllm-" +"ascend.readthedocs.io/en/latest/tutorials/multi_node.html)进行操作,并将模型更换为 " +"DeepSeek。" + +#: ../../faqs.md:89 +msgid "" +"12. There is no output in log when loading models using vllm-ascend, How to " +"solve it?" +msgstr "12. 使用 vllm-ascend 加载模型时日志没有输出,如何解决?" + +#: ../../faqs.md:91 +msgid "" +"If you're using vllm 0.7.3 version, this is a known progress bar display " +"issue in VLLM, which has been resolved in [this PR](https://github.com/vllm-" +"project/vllm/pull/12428), please cherry-pick it locally by yourself. " +"Otherwise, please fill up an issue." +msgstr "" +"如果你正在使用 vllm 0.7.3 版本,这是 VLLM 已知的进度条显示问题,已在 [此 PR](https://github.com/vllm-" +"project/vllm/pull/12428) 中解决,请自行在本地进行 cherry-pick。否则,请提交一个 issue。" + +#: ../../faqs.md:93 +msgid "13. How vllm-ascend is tested" +msgstr "13. 如何测试 vllm-ascend" + +#: ../../faqs.md:95 +msgid "" +"vllm-ascend is tested by functional test, performance test and accuracy " +"test." 
+msgstr "vllm-ascend 经过功能测试、性能测试和精度测试。" + +#: ../../faqs.md:97 +msgid "" +"**Functional test**: we added CI, includes portion of vllm's native unit " +"tests and vllm-ascend's own unit tests,on vllm-ascend's test, we test basic " +"functionality、popular models availability and [supported " +"features](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)" +" via e2e test" +msgstr "" +"**功能测试**:我们添加了CI,包含了vllm原生单元测试的一部分以及vllm-ascend自己的单元测试。在vllm-" +"ascend的测试中,我们通过e2e测试验证了基本功能、主流模型可用性和[支持的特性](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)。" + +#: ../../faqs.md:99 +msgid "" +"**Performance test**: we provide [benchmark](https://github.com/vllm-" +"project/vllm-ascend/tree/main/benchmarks) tools for end-to-end performance " +"benchmark which can easily to re-route locally, we'll publish a perf website" +" to show the performance test results for each pull request" +msgstr "" +"**性能测试**:我们提供了用于端到端性能基准测试的[基准测试](https://github.com/vllm-project/vllm-" +"ascend/tree/main/benchmarks)工具,可以方便地在本地重新运行。我们将发布一个性能网站,用于展示每个拉取请求的性能测试结果。" + +#: ../../faqs.md:101 +msgid "" +"**Accuracy test**: we're working on adding accuracy test to CI as well." +msgstr "**准确性测试**:我们也在努力将准确性测试添加到CI中。" + +#: ../../faqs.md:103 +msgid "" +"Finnall, for each release, we'll publish the performance test and accuracy " +"test report in the future." +msgstr "最后,未来每个版本发布时,我们都会公开性能测试和准确性测试报告。" + +#: ../../faqs.md:105 +msgid "14. How to fix the error \"InvalidVersion\" when using vllm-ascend?" +msgstr "14. 使用 vllm-ascend 时如何解决 “InvalidVersion” 错误?" + +#: ../../faqs.md:106 +msgid "" +"It's usually because you have installed an dev/editable version of vLLM " +"package. In this case, we provide the env variable `VLLM_VERSION` to let " +"users specify the version of vLLM package to use. Please set the env " +"variable `VLLM_VERSION` to the version of vLLM package you have installed. 
" +"The format of `VLLM_VERSION` should be `X.Y.Z`." +msgstr "" +"这通常是因为你安装了开发版或可编辑版本的 vLLM 包。在这种情况下,我们提供了环境变量 `VLLM_VERSION`,以便用户指定要使用的 vLLM " +"包版本。请将环境变量 `VLLM_VERSION` 设置为你已安装的 vLLM 包的版本。`VLLM_VERSION` 的格式应为 `X.Y.Z`。" + +#: ../../faqs.md:108 +msgid "15. How to handle Out Of Memory?" +msgstr "15. 如何处理内存溢出?" + +#: ../../faqs.md:109 +msgid "" +"OOM errors typically occur when the model exceeds the memory capacity of a " +"single NPU. For general guidance, you can refer to [vLLM's OOM " +"troubleshooting " +"documentation](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-" +"of-memory)." +msgstr "" +"当模型超出单个 NPU 的内存容量时,通常会发生 OOM(内存溢出)错误。一般性的指导可以参考 [vLLM 的 OOM " +"故障排除文档](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-" +"of-memory)。" + +#: ../../faqs.md:111 +msgid "" +"In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, " +"dynamic memory allocation/deallocation during inference can exacerbate " +"memory fragmentation, leading to OOM. To address this:" +msgstr "" +"在 NPU 的 HBM(高带宽内存)容量有限的场景下,推理过程中动态内存分配和释放会加剧内存碎片,从而导致 OOM(内存溢出)。为了解决这个问题:" + +#: ../../faqs.md:113 +msgid "" +"**Adjust `--gpu-memory-utilization`**: If unspecified, will use the default " +"value of `0.9`. You can decrease this param to reserve more memory to reduce" +" fragmentation risks. See more note in: [vLLM - Inference and Serving - " +"Engine " +"Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-" +"_engine_args_parser-cacheconfig)." +msgstr "" +"**调整 `--gpu-memory-utilization`**:如果未指定,将使用默认值 " +"`0.9`。你可以降低此参数来预留更多内存,从而降低内存碎片风险。参见更多说明:[vLLM - 推理与服务 - " +"引擎参数](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-" +"_engine_args_parser-cacheconfig)。" + +#: ../../faqs.md:115 +msgid "" +"**Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to " +"optimize NPU memory management. 
For example, you can `export " +"PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory " +"feature to mitigate memory fragmentation caused by frequent dynamic memory " +"size adjustments during runtime, see more note in: " +"[PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)." +msgstr "" +"**配置 `PYTORCH_NPU_ALLOC_CONF`**:设置此环境变量以优化NPU内存管理。例如,你可以通过 `export " +"PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` " +"来启用虚拟内存功能,以缓解运行时频繁动态调整内存大小导致的内存碎片问题,更多说明参见:[PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)。" + +#: ../../faqs.md:117 +msgid "16. Failed to enable NPU graph mode when running DeepSeek?" +msgstr "16. 运行 DeepSeek 时无法启用 NPU 图模式?" + +#: ../../faqs.md:118 +#, python-brace-format +msgid "" +"You may encounter the following error if running DeepSeek with NPU graph " +"mode enabled. The allowed number of queries per kv when enabling both MLA " +"and Graph mode only support {32, 64, 128}, **Thus this is not supported for " +"DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode " +"support on DeepSeek-V2-Lite will be done in the future." +msgstr "" +"如果在启用NPU图模式(Graph " +"mode)运行DeepSeek时,您可能会遇到以下错误。当同时启用MLA和图模式时,每个kv允许的查询数只支持{32, 64, " +"128},**因此这不支持DeepSeek-V2-Lite**,因为它只有16个注意力头。未来会增加对DeepSeek-V2-Lite在NPU图模式下的支持。" + +#: ../../faqs.md:120 +#, python-brace-format +msgid "" +"And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the " +"tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}." +msgstr "" +"如果你正在使用 DeepSeek-V3 或 DeepSeek-R1,请确保在张量并行切分后,num_heads / num_kv_heads 的值为 " +"{32, 64, 128} 中的一个。" + +#: ../../faqs.md:127 +msgid "" +"17. Failed to reinstall vllm-ascend from source after uninstalling vllm-" +"ascend?" +msgstr "17. 卸载 vllm-ascend 后无法从源码重新安装 vllm-ascend?" 
+ +#: ../../faqs.md:128 +msgid "" +"You may encounter the problem of C compilation failure when reinstalling " +"vllm-ascend from source using pip. If the installation fails, it is " +"recommended to use `python setup.py install` to install, or use `python " +"setup.py clean` to clear the cache." +msgstr "" +"当你使用 pip 从源码重新安装 vllm-ascend 时,可能会遇到 C 编译失败的问题。如果安装失败,建议使用 `python setup.py " +"install` 进行安装,或者使用 `python setup.py clean` 清除缓存。" + +#: ../../faqs.md:130 +msgid "18. How to generate determinitic results when using vllm-ascend?" +msgstr "18. 使用 vllm-ascend 时如何生成确定性结果?" + +#: ../../faqs.md:131 +msgid "There are several factors that affect output certainty:" +msgstr "有几个因素会影响输出的确定性:" + +#: ../../faqs.md:133 +msgid "" +"Sampler Method: using **Greedy sample** by setting `temperature=0` in " +"`SamplingParams`, e.g.:" +msgstr "" +"采样方法:通过在 `SamplingParams` 中设置 `temperature=0` 来使用 **贪婪采样(Greedy " +"sample)**,例如:" + +#: ../../faqs.md:158 +msgid "Set the following enveriments parameters:" +msgstr "设置以下环境参数:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/index.po new file mode 100644 index 0000000..2683485 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/index.po @@ -0,0 +1,79 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: 2025-07-18 10:05+0800\n" +"Last-Translator: \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" +"X-Generator: Poedit 3.5\n" + +#: ../../index.md:33 +msgid "Getting Started" +msgstr "快速开始" + +#: ../../index.md:43 +msgid "User Guide" +msgstr "用户指南" + +#: ../../index.md:53 +msgid "Developer Guide" +msgstr "开发者指南" + +#: ../../index.md:64 +msgid "Community" +msgstr "社区" + +#: ../../index.md:1 +msgid "Welcome to vLLM Ascend Plugin" +msgstr "欢迎使用 vLLM Ascend 插件" + +#: ../../index.md:3 +msgid "vLLM" +msgstr "vLLM" + +#: ../../index.md:24 +msgid "" +"vLLM Ascend plugin (vllm-ascend) is a community maintained hardware plugin " +"for running vLLM on the Ascend NPU." +msgstr "" +"vLLM Ascend 插件(vllm-ascend)是一个由社区维护的硬件插件,用于在 Ascend " +"NPU 上运行 vLLM。" + +#: ../../index.md:26 +msgid "" +"This plugin is the recommended approach for supporting the Ascend backend " +"within the vLLM community. It adheres to the principles outlined in the " +"[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/" +"issues/11162), providing a hardware-pluggable interface that decouples the " +"integration of the Ascend NPU with vLLM." +msgstr "" +"该插件是 vLLM 社区推荐用于支持 Ascend 后端的方法。它遵循 [[RFC]: Hardware " +"pluggable](https://github.com/vllm-project/vllm/issues/11162) 中提出的原" +"则,提供了一个硬件可插拔接口,实现了 Ascend NPU 与 vLLM 集成的解耦。" + +#: ../../index.md:28 +msgid "" +"By using vLLM Ascend plugin, popular open-source models, including " +"Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run " +"seamlessly on the Ascend NPU." 
+msgstr "" +"通过使用 vLLM Ascend 插件,流行的开源模型,包括 Transformer 类、混合专家、" +"嵌入式、多模态大模型等,都可以在 Ascend NPU 上无缝运行。" + +#: ../../index.md:30 +msgid "Documentation" +msgstr "文档" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/installation.po b/docs/source/locale/zh_CN/LC_MESSAGES/installation.po new file mode 100644 index 0000000..5ed464b --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/installation.po @@ -0,0 +1,293 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: 2025-07-18 10:09+0800\n" +"Last-Translator: \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" +"X-Generator: Poedit 3.5\n" + +#: ../../installation.md:1 +msgid "Installation" +msgstr "安装" + +#: ../../installation.md:3 +msgid "This document describes how to install vllm-ascend manually." +msgstr "本文档介绍如何手动安装 vllm-ascend。" + +#: ../../installation.md:5 +msgid "Requirements" +msgstr "要求" + +#: ../../installation.md:7 +msgid "OS: Linux" +msgstr "操作系统:Linux" + +#: ../../installation.md:8 +msgid "Python: >= 3.9, < 3.12" +msgstr "Python:>= 3.9,< 3.12" + +#: ../../installation.md:9 +msgid "A hardware with Ascend NPU. It's usually the Atlas 800 A2 series." 
+msgstr "配备有昇腾NPU的硬件,通常是Atlas 800 A2系列。" + +#: ../../installation.md:10 +msgid "Software:" +msgstr "软件:" + +#: ../../installation.md +msgid "Software" +msgstr "软件" + +#: ../../installation.md +msgid "Supported version" +msgstr "支持的版本" + +#: ../../installation.md +msgid "Note" +msgstr "注释" + +#: ../../installation.md +msgid "CANN" +msgstr "CANN" + +#: ../../installation.md +msgid ">= 8.1.RC1" +msgstr ">= 8.1.RC1" + +#: ../../installation.md +msgid "Required for vllm-ascend and torch-npu" +msgstr "vllm-ascend 和 torch-npu 必需" + +#: ../../installation.md +msgid "torch-npu" +msgstr "torch-npu" + +#: ../../installation.md +msgid ">= 2.5.1.post1.dev20250619" +msgstr ">= 2.5.1.post1.dev20250619" + +#: ../../installation.md +msgid "" +"Required for vllm-ascend, No need to install manually, it will be auto " +"installed in below steps" +msgstr "vllm-ascend 必需,无需手动安装,后续步骤会自动安装。" + +#: ../../installation.md +msgid "torch" +msgstr "torch" + +#: ../../installation.md +msgid ">= 2.5.1" +msgstr ">= 2.5.1" + +#: ../../installation.md +msgid "Required for torch-npu and vllm" +msgstr "torch-npu 和 vllm 所需" + +#: ../../installation.md:18 +msgid "You have 2 way to install:" +msgstr "你有两种安装方式:" + +#: ../../installation.md:19 +msgid "" +"**Using pip**: first prepare env manually or via CANN image, then install " +"`vllm-ascend` using pip." +msgstr "" +"**使用 pip**:首先手动准备环境或通过 CANN 镜像准备环境,然后使用 pip 安装 " +"`vllm-ascend`。" + +#: ../../installation.md:20 +msgid "" +"**Using docker**: use the `vllm-ascend` pre-built docker image directly." +msgstr "**使用 docker**:直接使用 `vllm-ascend` 预构建的 docker 镜像。" + +#: ../../installation.md:22 +msgid "Configure a new environment" +msgstr "配置一个新环境" + +#: ../../installation.md:24 +msgid "" +"Before installing, you need to make sure firmware/driver and CANN are " +"installed correctly, refer to [link](https://ascend.github.io/docs/sources/" +"ascend/quick_install.html) for more details." 
+msgstr "" +"在安装之前,您需要确保固件/驱动和 CANN 已正确安装,更多详情请参考 [链接]" +"(https://ascend.github.io/docs/sources/ascend/quick_install.html)。" + +#: ../../installation.md:26 +msgid "Configure hardware environment" +msgstr "配置硬件环境" + +#: ../../installation.md:28 +msgid "" +"To verify that the Ascend NPU firmware and driver were correctly installed, " +"run:" +msgstr "要验证 Ascend NPU 固件和驱动程序是否正确安装,请运行:" + +#: ../../installation.md:34 +msgid "" +"Refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/" +"sources/ascend/quick_install.html) for more details." +msgstr "" +"更多详情请参考[Ascend环境搭建指南](https://ascend.github.io/docs/sources/" +"ascend/quick_install.html)。" + +#: ../../installation.md:36 +msgid "Configure software environment" +msgstr "配置软件环境" + +#: ../../installation.md +msgid "Before using pip" +msgstr "在使用 pip 之前" + +#: ../../installation.md:46 +msgid "" +"The easiest way to prepare your software environment is using CANN image " +"directly:" +msgstr "最简单的方式是直接使用 CANN 镜像来准备您的软件环境:" + +#: ../../installation.md +msgid "Click here to see \"Install CANN manually\"" +msgstr "点击此处查看“手动安装 CANN”" + +#: ../../installation.md:72 +msgid "You can also install CANN manually:" +msgstr "你也可以手动安装 CANN:" + +#: ../../installation.md +msgid "Before using docker" +msgstr "在使用 docker 之前" + +#: ../../installation.md:104 +msgid "" +"No more extra step if you are using `vllm-ascend` prebuilt docker image." +msgstr "如果你使用 `vllm-ascend` 预构建的 docker 镜像,就无需额外的步骤。" + +#: ../../installation.md:108 +msgid "Once it's done, you can start to set up `vllm` and `vllm-ascend`." 
+msgstr "完成后,你可以开始配置 `vllm` 和 `vllm-ascend`。" + +#: ../../installation.md:110 +msgid "Setup vllm and vllm-ascend" +msgstr "安装 vllm 和 vllm-ascend" + +#: ../../installation.md +msgid "Using pip" +msgstr "使用 pip" + +#: ../../installation.md:121 +msgid "First install system dependencies and config pip mirror:" +msgstr "首先安装系统依赖并配置 pip 镜像:" + +#: ../../installation.md:133 +msgid "" +"**[Optional]** Then config the extra-index of `pip` if you are working on a " +"x86 machine or using torch-npu dev version:" +msgstr "" +"**[可选]** 如果你在 x86 机器上工作或使用 torch-npu 开发版,请配置 `pip` 的额" +"外索引:" + +#: ../../installation.md:140 +msgid "" +"Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:" +msgstr "然后你可以从**预编译的 wheel 包**安装 `vllm` 和 `vllm-ascend`:" + +#: ../../installation.md +msgid "Click here to see \"Build from source code\"" +msgstr "点击此处查看“从源代码构建”" + +#: ../../installation.md:153 +msgid "or build from **source code**:" +msgstr "或者从**源代码**构建:" + +#: ../../installation.md:171 +msgid "" +"vllm-ascend will build custom ops by default. If you don't want to build " +"it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it." +msgstr "" +"vllm-ascend 默认会编译自定义算子。如果你不想编译它,可以设置环境变量 " +"`COMPILE_CUSTOM_KERNELS=0` 来禁用。" + +#: ../../installation.md:175 +msgid "" +"If you are building from v0.7.3-dev and intend to use sleep mode feature, " +"you should set `COMPILE_CUSTOM_KERNELS=1` manually. To build custom ops, " +"gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using " +"`pip install -e .` and encourage a torch-npu version conflict, please " +"install with `pip install --no-build-isolation -e .` to build on system " +"env. If you encounter other problems during compiling, it is probably " +"because unexpected compiler is being used, you may export `CXX_COMPILER` " +"and `C_COMPILER` in env to specify your g++ and gcc locations before " +"compiling." 
+msgstr "" +"如果你是从 v0.7.3-dev 版本开始构建,并且打算使用休眠模式功能,你需要手动设" +"置 `COMPILE_CUSTOM_KERNELS=1`。构建自定义算子时,要求 gcc/g++ 版本高于 8 且" +"支持 c++ 17 或更高标准。如果你正在使用 `pip install -e .` 并且出现了 torch-" +"npu 版本冲突,请使用 `pip install --no-build-isolation -e .` 在系统环境下进" +"行安装。如果在编译过程中遇到其它问题,可能是因为使用了非预期的编译器,你可以" +"在编译前通过环境变量导出 `CXX_COMPILER` 和 `C_COMPILER`,以指定你的 g++ 和 " +"gcc 路径。" + +#: ../../installation.md +msgid "Using docker" +msgstr "使用 docker" + +#: ../../installation.md:184 +msgid "You can just pull the **prebuilt image** and run it with bash." +msgstr "你可以直接拉取**预构建镜像**并用 bash 运行它。" + +#: ../../installation.md +msgid "Click here to see \"Build from Dockerfile\"" +msgstr "点击这里查看“从 Dockerfile 构建”" + +#: ../../installation.md:187 +msgid "or build IMAGE from **source code**:" +msgstr "或从**源代码**构建 IMAGE:" + +#: ../../installation.md:218 +msgid "" +"The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed " +"in `/vllm-workspace` and installed in [development mode](https://setuptools." +"pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to " +"help developer immediately take place changes without requiring a new " +"installation." +msgstr "" +"默认的工作目录是 `/workspace`,vLLM 和 vLLM Ascend 代码被放置在 `/vllm-" +"workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/" +"development_mode.html)(`pip install -e`)安装,以便开发者能够即时生效更改," +"而无需重新安装。" + +#: ../../installation.md:222 +msgid "Extra information" +msgstr "额外信息" + +#: ../../installation.md:224 +msgid "Verify installation" +msgstr "验证安装" + +#: ../../installation.md:226 +msgid "Create and run a simple inference test. 
The `example.py` can be like:" +msgstr "创建并运行一个简单的推理测试。`example.py` 可以如下:" + +#: ../../installation.md:251 +msgid "Then run:" +msgstr "然后运行:" + +#: ../../installation.md:259 +msgid "The output will be like:" +msgstr "输出将会像这样:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po b/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po new file mode 100644 index 0000000..ebe66fd --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po @@ -0,0 +1,149 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: 2025-07-18 10:09+0800\n" +"Last-Translator: \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" +"X-Generator: Poedit 3.5\n" + +#: ../../quick_start.md:1 +msgid "Quickstart" +msgstr "快速入门" + +#: ../../quick_start.md:3 +msgid "Prerequisites" +msgstr "先决条件" + +#: ../../quick_start.md:5 +msgid "Supported Devices" +msgstr "支持的设备" + +#: ../../quick_start.md:6 +msgid "" +"Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 " +"Box16, Atlas 300T A2)" +msgstr "" +"Atlas A2 训练系列(Atlas 800T A2,Atlas 900 A2 PoD,Atlas 200T A2 Box16," +"Atlas 300T A2)" + +#: ../../quick_start.md:7 +msgid "Atlas 800I A2 Inference series (Atlas 800I A2)" +msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)" + +#: ../../quick_start.md:9 +msgid "Setup environment using container" +msgstr "使用容器设置环境" + +#: ../../quick_start.md +msgid "Ubuntu" +msgstr "Ubuntu" + +#: ../../quick_start.md +msgid "openEuler" +msgstr "openEuler" + +#: ../../quick_start.md:69 +msgid "" +"The default workdir is `/workspace`, vLLM and 
vLLM Ascend code are placed " +"in `/vllm-workspace` and installed in [development mode](https://setuptools." +"pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to " +"help developer immediately take place changes without requiring a new " +"installation." +msgstr "" +"默认的工作目录是 `/workspace`,vLLM 和 vLLM Ascend 代码被放置在 `/vllm-" +"workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/" +"development_mode.html)(`pip install -e`)安装,以便开发者能够即时生效更改," +"而无需重新安装。" + +#: ../../quick_start.md:71 +msgid "Usage" +msgstr "用法" + +#: ../../quick_start.md:73 +msgid "You can use Modelscope mirror to speed up download:" +msgstr "你可以使用 Modelscope 镜像来加速下载:" + +#: ../../quick_start.md:80 +msgid "There are two ways to start vLLM on Ascend NPU:" +msgstr "在昇腾 NPU 上启动 vLLM 有两种方式:" + +#: ../../quick_start.md +msgid "Offline Batched Inference" +msgstr "离线批量推理" + +#: ../../quick_start.md:86 +msgid "" +"With vLLM installed, you can start generating texts for list of input " +"prompts (i.e. offline batch inferencing)." +msgstr "" +"安装了 vLLM 后,您可以开始为一系列输入提示生成文本(即离线批量推理)。" + +#: ../../quick_start.md:88 +msgid "" +"Try to run below Python script directly or use `python3` shell to generate " +"texts:" +msgstr "" +"尝试直接运行下面的 Python 脚本,或者使用 `python3` 交互式命令行来生成文本:" + +#: ../../quick_start.md +msgid "OpenAI Completions API" +msgstr "OpenAI Completions API" + +#: ../../quick_start.md:114 +msgid "" +"vLLM can also be deployed as a server that implements the OpenAI API " +"protocol. 
Run the following command to start the vLLM server with the [Qwen/" +"Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) " +"model:" +msgstr "" +"vLLM 也可以作为实现 OpenAI API 协议的服务器进行部署。运行以下命令,使用 " +"[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-" +"Instruct) 模型启动 vLLM 服务器:" + +#: ../../quick_start.md:124 +msgid "If you see log as below:" +msgstr "如果你看到如下日志:" + +#: ../../quick_start.md:132 +msgid "Congratulations, you have successfully started the vLLM server!" +msgstr "恭喜,你已经成功启动了 vLLM 服务器!" + +#: ../../quick_start.md:134 +msgid "You can query the list the models:" +msgstr "你可以查询模型列表:" + +#: ../../quick_start.md:141 +msgid "You can also query the model with input prompts:" +msgstr "你也可以通过输入提示来查询模型:" + +#: ../../quick_start.md:155 +msgid "" +"vLLM is serving as background process, you can use `kill -2 $VLLM_PID` to " +"stop the background process gracefully, it's equal to `Ctrl-C` to stop " +"foreground vLLM process:" +msgstr "" +"vLLM 正作为后台进程运行,你可以使用 `kill -2 $VLLM_PID` 来优雅地停止后台进" +"程,这等同于使用 `Ctrl-C` 停止前台 vLLM 进程:" + +#: ../../quick_start.md:164 +msgid "You will see output as below:" +msgstr "你将会看到如下输出:" + +#: ../../quick_start.md:172 +msgid "Finally, you can exit container by using `ctrl-D`." +msgstr "最后,你可以通过按 `ctrl-D` 退出容器。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po new file mode 100644 index 0000000..49292c4 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po @@ -0,0 +1,29 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/index.md:3 +msgid "Deployment" +msgstr "部署" + +#: ../../tutorials/index.md:1 +msgid "Tutorials" +msgstr "教程" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po new file mode 100644 index 0000000..b7bdb99 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po @@ -0,0 +1,192 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/multi_node.md:1 +msgid "Multi-Node-DP (DeepSeek)" +msgstr "多节点分布式处理(DeepSeek)" + +#: ../../tutorials/multi_node.md:3 +msgid "Getting Start" +msgstr "快速开始" + +#: ../../tutorials/multi_node.md:4 +msgid "" +"vLLM-Ascend now supports Data Parallel (DP) deployment, enabling model " +"weights to be replicated across multiple NPUs or instances, each processing " +"independent batches of requests. 
This is particularly useful for scaling " +"throughput across devices while maintaining high resource utilization." +msgstr "" +"vLLM-Ascend 现在支持数据并行(DP)部署,可以在多个 NPU " +"或实例之间复制模型权重,每个实例处理独立的请求批次。这对于在保证高资源利用率的同时,实现跨设备的吞吐量扩展特别有用。" + +#: ../../tutorials/multi_node.md:6 +msgid "" +"Each DP rank is deployed as a separate “core engine” process which " +"communicates with front-end process(es) via ZMQ sockets. Data Parallel can " +"be combined with Tensor Parallel, in which case each DP engine owns a number" +" of per-NPU worker processes equal to the TP size." +msgstr "" +"每个 DP 进程作为一个单独的“核心引擎”进程部署,并通过 ZMQ 套接字与前端进程通信。数据并行可以与张量并行结合使用,此时每个 DP " +"引擎拥有数量等于 TP 大小的每 NPU 工作进程。" + +#: ../../tutorials/multi_node.md:8 +msgid "" +"For Mixture-of-Experts (MoE) models — especially advanced architectures like" +" DeepSeek that utilize Multi-head Latent Attention (MLA) — a hybrid " +"parallelism approach is recommended: - Use **Data Parallelism (DP)** for" +" attention layers, which are replicated across devices and handle separate " +"batches. - Use **Expert or Tensor Parallelism (EP/TP)** for expert " +"layers, which are sharded across devices to distribute the computation." +msgstr "" +"对于混合专家(Mixture-of-Experts, MoE)模型——尤其是像 DeepSeek 这样采用多头潜在注意力(Multi-head Latent Attention, MLA)的高级架构——推荐使用混合并行策略:\n" +" - 对于注意力层,使用 **数据并行(Data Parallelism, DP)**,这些层会在各设备间复刻,并处理不同的批次。\n" +" - 对于专家层,使用 **专家并行或张量并行(Expert or Tensor Parallelism, EP/TP)**,这些层会在设备间分片,从而分担计算。" + +#: ../../tutorials/multi_node.md:12 +msgid "" +"This division enables attention layers to be replicated across Data Parallel" +" (DP) ranks, enabling them to process different batches independently. " +"Meanwhile, expert layers are partitioned (sharded) across devices using " +"Expert or Tensor Parallelism(DP*TP), maximizing hardware utilization and " +"efficiency." 
+msgstr "" +"这种划分使得注意力层能够在数据并行(DP)组内复制,从而能够独立处理不同的批次。同时,专家层通过专家或张量并行(DP*TP)在设备间进行分区(切片),最大化硬件利用率和效率。" + +#: ../../tutorials/multi_node.md:14 +msgid "" +"In these cases the data parallel ranks are not completely independent, " +"forward passes must be aligned and expert layers across all ranks are " +"required to synchronize during every forward pass, even if there are fewer " +"requests to be processed than DP ranks." +msgstr "" +"在这些情况下,数据并行的各个 rank 不是完全独立的,前向传播必须对齐,并且所有 rank " +"上的专家层在每次前向传播时都需要同步,即使待处理的请求数量少于 DP rank 的数量。" + +#: ../../tutorials/multi_node.md:16 +msgid "" +"For MoE models, when any requests are in progress in any rank, we must " +"ensure that empty “dummy” forward passes are performed in all ranks which " +"don’t currently have any requests scheduled. This is handled via a separate " +"DP `Coordinator` process which communicates with all of the ranks, and a " +"collective operation performed every N steps to determine when all ranks " +"become idle and can be paused. When TP is used in conjunction with DP, " +"expert layers form an EP or TP group of size (DP x TP)." +msgstr "" +"对于 MoE 模型,当任何一个 rank 有请求正在进行时,必须确保所有当前没有请求的 rank 都执行空的“虚拟”前向传播。这是通过一个单独的 DP " +"`Coordinator` 协调器进程来实现的,该进程与所有 rank 通信,并且每隔 N 步执行一次集体操作,以判断所有 rank " +"是否都处于空闲状态并可以暂停。当 TP 与 DP 结合使用时,专家层会组成一个规模为(DP x TP)的 EP 或 TP 组。" + +#: ../../tutorials/multi_node.md:18 +msgid "Verify Multi-Node Communication Environment" +msgstr "验证多节点通信环境" + +#: ../../tutorials/multi_node.md:20 +msgid "Physical Layer Requirements:" +msgstr "物理层要求:" + +#: ../../tutorials/multi_node.md:22 +msgid "" +"The physical machines must be located on the same WLAN, with network " +"connectivity." +msgstr "物理机器必须位于同一个 WLAN 中,并且具有网络连接。" + +#: ../../tutorials/multi_node.md:23 +msgid "" +"All NPUs are connected with optical modules, and the connection status must " +"be normal." 
+msgstr "所有 NPU 都通过光模块连接,且连接状态必须正常。" + +#: ../../tutorials/multi_node.md:25 +msgid "Verification Process:" +msgstr "验证流程:" + +#: ../../tutorials/multi_node.md:27 +msgid "" +"Execute the following commands on each node in sequence. The results must " +"all be `success` and the status must be `UP`:" +msgstr "在每个节点上依次执行以下命令。所有结果必须为 `success` 且状态必须为 `UP`:" + +#: ../../tutorials/multi_node.md:44 +msgid "NPU Interconnect Verification:" +msgstr "NPU 互连验证:" + +#: ../../tutorials/multi_node.md:45 +msgid "1. Get NPU IP Addresses" +msgstr "1. 获取 NPU IP 地址" + +#: ../../tutorials/multi_node.md:50 +msgid "2. Cross-Node PING Test" +msgstr "2. 跨节点PING测试" + +#: ../../tutorials/multi_node.md:56 +msgid "Run with docker" +msgstr "用 docker 运行" + +#: ../../tutorials/multi_node.md:57 +msgid "" +"Assume you have two Atlas 800 A2(64G*8) nodes, and want to deploy the " +"`deepseek-v3-w8a8` quantitative model across multi-node." +msgstr "假设你有两台 Atlas 800 A2(64G*8)节点,并且想要在多节点上部署 `deepseek-v3-w8a8` 量化模型。" + +#: ../../tutorials/multi_node.md:92 +msgid "" +"Before launch the inference server, ensure some environment variables are " +"set for multi node communication" +msgstr "在启动推理服务器之前,确保已经为多节点通信设置了一些环境变量。" + +#: ../../tutorials/multi_node.md:95 +msgid "Run the following scripts on two nodes respectively" +msgstr "分别在两台节点上运行以下脚本" + +#: ../../tutorials/multi_node.md:97 +msgid "**node0**" +msgstr "**节点0**" + +#: ../../tutorials/multi_node.md:137 +msgid "**node1**" +msgstr "**节点1**" + +#: ../../tutorials/multi_node.md:176 +msgid "" +"The Deployment view looks like: ![alt text](../assets/multi_node_dp.png)" +msgstr "部署视图如下所示:![替代文本](../assets/multi_node_dp.png)" + +#: ../../tutorials/multi_node.md:176 +msgid "alt text" +msgstr "替代文本" + +#: ../../tutorials/multi_node.md:179 +msgid "" +"Once your server is started, you can query the model with input prompts:" +msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#: ../../tutorials/multi_node.md:192 +msgid "Run benchmarks" +msgstr "运行基准测试" + +#: 
../../tutorials/multi_node.md:193 +msgid "" +"For details please refer to [benchmark](https://github.com/vllm-" +"project/vllm-ascend/tree/main/benchmarks)" +msgstr "" +"详细信息请参阅 [benchmark](https://github.com/vllm-project/vllm-" +"ascend/tree/main/benchmarks)" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po new file mode 100644 index 0000000..512de82 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po @@ -0,0 +1,62 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/multi_npu.md:1 +msgid "Multi-NPU (QwQ 32B)" +msgstr "多-NPU(QwQ 32B)" + +#: ../../tutorials/multi_npu.md:3 +msgid "Run vllm-ascend on Multi-NPU" +msgstr "在多NPU上运行 vllm-ascend" + +#: ../../tutorials/multi_npu.md:5 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/multi_npu.md:30 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/multi_npu.md:40 +msgid "Online Inference on Multi-NPU" +msgstr "多NPU的在线推理" + +#: ../../tutorials/multi_npu.md:42 +msgid "Run the following script to start the vLLM server on Multi-NPU:" +msgstr "运行以下脚本,在多NPU上启动 vLLM 服务器:" + +#: ../../tutorials/multi_npu.md:48 +msgid "" +"Once your server is started, you can query the model with input prompts" +msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#: ../../tutorials/multi_npu.md:63 +msgid "Offline 
Inference on Multi-NPU" +msgstr "多NPU离线推理" + +#: ../../tutorials/multi_npu.md:65 +msgid "Run the following script to execute offline inference on multi-NPU:" +msgstr "运行以下脚本以在多NPU上执行离线推理:" + +#: ../../tutorials/multi_npu.md:102 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po new file mode 100644 index 0000000..8718f07 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_moge.po @@ -0,0 +1,86 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/multi_npu_moge.md:1 +msgid "Multi-NPU (Pangu Pro MoE)" +msgstr "多NPU(Pangu Pro MoE)" + +#: ../../tutorials/multi_npu_moge.md:3 +msgid "Run vllm-ascend on Multi-NPU" +msgstr "在多NPU上运行 vllm-ascend" + +#: ../../tutorials/multi_npu_moge.md:5 +msgid "Run container:" +msgstr "运行容器:" + +#: ../../tutorials/multi_npu_moge.md:30 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/multi_npu_moge.md:37 +msgid "Download the model:" +msgstr "下载该模型:" + +#: ../../tutorials/multi_npu_moge.md:44 +msgid "Online Inference on Multi-NPU" +msgstr "多NPU上的在线推理" + +#: ../../tutorials/multi_npu_moge.md:46 +msgid "Run the following script to start the vLLM server on Multi-NPU:" +msgstr "运行以下脚本,在多NPU上启动 vLLM 服务器:" 
+
+#: ../../tutorials/multi_npu_moge.md:55
+msgid ""
+"Once your server is started, you can query the model with input prompts:"
+msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:"
+
+#: ../../tutorials/multi_npu_moge.md
+msgid "v1/completions"
+msgstr "v1/completions"
+
+#: ../../tutorials/multi_npu_moge.md
+msgid "v1/chat/completions"
+msgstr "v1/chat/completions"
+
+#: ../../tutorials/multi_npu_moge.md:96
+msgid "If you run this successfully, you can see the info shown below:"
+msgstr "如果你成功运行这个,你可以看到如下所示的信息:"
+
+#: ../../tutorials/multi_npu_moge.md:102
+msgid "Offline Inference on Multi-NPU"
+msgstr "多NPU离线推理"
+
+#: ../../tutorials/multi_npu_moge.md:104
+msgid "Run the following script to execute offline inference on multi-NPU:"
+msgstr "运行以下脚本以在多NPU上执行离线推理:"
+
+#: ../../tutorials/multi_npu_moge.md
+msgid "Graph Mode"
+msgstr "图模式"
+
+#: ../../tutorials/multi_npu_moge.md
+msgid "Eager Mode"
+msgstr "即时模式"
+
+#: ../../tutorials/multi_npu_moge.md:230
+msgid "If you run this script successfully, you can see the info shown below:"
+msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po
new file mode 100644
index 0000000..6307435
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_quantization.po
@@ -0,0 +1,82 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2025, vllm-ascend team
+# This file is distributed under the same license as the vllm-ascend
+# package.
+# FIRST AUTHOR , 2025.
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/multi_npu_quantization.md:1 +msgid "Multi-NPU (QwQ 32B W8A8)" +msgstr "多NPU(QwQ 32B W8A8)" + +#: ../../tutorials/multi_npu_quantization.md:3 +msgid "Run docker container" +msgstr "运行 docker 容器" + +#: ../../tutorials/multi_npu_quantization.md:5 +msgid "w8a8 quantization feature is supported by v0.8.4rc2 or higher" +msgstr "w8a8 量化功能由 v0.8.4rc2 或更高版本支持" + +#: ../../tutorials/multi_npu_quantization.md:31 +msgid "Install modelslim and convert model" +msgstr "安装 modelslim 并转换模型" + +#: ../../tutorials/multi_npu_quantization.md:33 +msgid "" +"You can choose to convert the model yourself or use the quantized model we " +"uploaded, see https://www.modelscope.cn/models/vllm-ascend/QwQ-32B-W8A8" +msgstr "" +"你可以选择自己转换模型,或者使用我们上传的量化模型,详见 https://www.modelscope.cn/models/vllm-" +"ascend/QwQ-32B-W8A8" + +#: ../../tutorials/multi_npu_quantization.md:56 +msgid "Verify the quantized model" +msgstr "验证量化模型" + +#: ../../tutorials/multi_npu_quantization.md:57 +msgid "The converted model files looks like:" +msgstr "转换后的模型文件如下所示:" + +#: ../../tutorials/multi_npu_quantization.md:70 +msgid "" +"Run the following script to start the vLLM server with quantized model:" +msgstr "运行以下脚本以启动带有量化模型的 vLLM 服务器:" + +#: ../../tutorials/multi_npu_quantization.md:73 +msgid "" +"The value \"ascend\" for \"--quantization\" argument will be supported after" +" [a specific PR](https://github.com/vllm-project/vllm-ascend/pull/877) is " +"merged and released, you can cherry-pick this commit for now." 
+msgstr ""
+"在 [特定的PR](https://github.com/vllm-project/vllm-ascend/pull/877) 合并并发布后, \"--"
+"quantization\" 参数将支持值 \"ascend\",目前你可以先 cherry-pick(拣选)该提交。"
+
+#: ../../tutorials/multi_npu_quantization.md:79
+msgid ""
+"Once your server is started, you can query the model with input prompts"
+msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。"
+
+#: ../../tutorials/multi_npu_quantization.md:93
+msgid ""
+"Run the following script to execute offline inference on multi-NPU with "
+"quantized model:"
+msgstr "运行以下脚本,在多NPU上使用量化模型执行离线推理:"
+
+#: ../../tutorials/multi_npu_quantization.md:96
+msgid "To enable quantization for ascend, quantization method must be \"ascend\""
+msgstr "要在ascend上启用量化,量化方法必须为“ascend”。"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po
new file mode 100644
index 0000000..7133893
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu_qwen3_moe.po
@@ -0,0 +1,71 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2025, vllm-ascend team
+# This file is distributed under the same license as the vllm-ascend
+# package.
+# FIRST AUTHOR , 2025.
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/multi_npu_qwen3_moe.md:1 +msgid "Multi-NPU (Qwen3-30B-A3B)" +msgstr "多NPU(Qwen3-30B-A3B)" + +#: ../../tutorials/multi_npu_qwen3_moe.md:3 +msgid "Run vllm-ascend on Multi-NPU with Qwen3 MoE" +msgstr "在多NPU上运行带有Qwen3 MoE的vllm-ascend" + +#: ../../tutorials/multi_npu_qwen3_moe.md:5 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/multi_npu_qwen3_moe.md:30 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/multi_npu_qwen3_moe.md:40 +msgid "Online Inference on Multi-NPU" +msgstr "多NPU的在线推理" + +#: ../../tutorials/multi_npu_qwen3_moe.md:42 +msgid "Run the following script to start the vLLM server on Multi-NPU:" +msgstr "运行以下脚本以在多NPU上启动 vLLM 服务器:" + +#: ../../tutorials/multi_npu_qwen3_moe.md:44 +msgid "" +"For an Atlas A2 with 64GB of NPU card memory, tensor-parallel-size should be" +" at least 2, and for 32GB of memory, tensor-parallel-size should be at least" +" 4." 
+msgstr "" +"对于拥有64GB NPU卡内存的Atlas A2,tensor-parallel-size 至少应为2;对于32GB内存的NPU卡,tensor-" +"parallel-size 至少应为4。" + +#: ../../tutorials/multi_npu_qwen3_moe.md:50 +msgid "" +"Once your server is started, you can query the model with input prompts" +msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#: ../../tutorials/multi_npu_qwen3_moe.md:65 +msgid "Offline Inference on Multi-NPU" +msgstr "多NPU离线推理" + +#: ../../tutorials/multi_npu_qwen3_moe.md:67 +msgid "Run the following script to execute offline inference on multi-NPU:" +msgstr "运行以下脚本以在多NPU上执行离线推理:" + +#: ../../tutorials/multi_npu_qwen3_moe.md:104 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po new file mode 100644 index 0000000..530065d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_node_300i.po @@ -0,0 +1,110 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_node_300i.md:1 +msgid "Single Node (Atlas 300I series)" +msgstr "单节点(Atlas 300I 系列)" + +#: ../../tutorials/single_node_300i.md:4 +msgid "" +"This Atlas 300I series is currently experimental. In future versions, there " +"may be behavioral changes around model coverage, performance improvement." 
+msgstr "Atlas 300I 系列目前处于实验阶段。在未来的版本中,模型覆盖范围和性能提升方面可能会有行为上的变化。"
+
+#: ../../tutorials/single_node_300i.md:7
+msgid "Run vLLM on Altlas 300I series"
+msgstr "在 Atlas 300I 系列上运行 vLLM"
+
+#: ../../tutorials/single_node_300i.md:9
+msgid "Run docker container:"
+msgstr "运行 docker 容器:"
+
+#: ../../tutorials/single_node_300i.md:38
+msgid "Setup environment variables:"
+msgstr "设置环境变量:"
+
+#: ../../tutorials/single_node_300i.md:48
+msgid "Online Inference on NPU"
+msgstr "在NPU上进行在线推理"
+
+#: ../../tutorials/single_node_300i.md:50
+msgid ""
+"Run the following script to start the vLLM server on NPU(Qwen3-0.6B:1 card, "
+"Qwen2.5-7B-Instruct:2 cards, Pangu-Pro-MoE-72B: 8 cards):"
+msgstr ""
+"运行以下脚本,在 NPU 上启动 vLLM 服务器(Qwen3-0.6B:1 张卡,Qwen2.5-7B-Instruct:2 张卡,Pangu-"
+"Pro-MoE-72B:8 张卡):"
+
+#: ../../tutorials/single_node_300i.md
+msgid "Qwen3-0.6B"
+msgstr "Qwen3-0.6B"
+
+#: ../../tutorials/single_node_300i.md:59
+#: ../../tutorials/single_node_300i.md:89
+#: ../../tutorials/single_node_300i.md:126
+msgid "Run the following command to start the vLLM server:"
+msgstr "运行以下命令以启动 vLLM 服务器:"
+
+#: ../../tutorials/single_node_300i.md:70
+#: ../../tutorials/single_node_300i.md:100
+#: ../../tutorials/single_node_300i.md:140
+msgid ""
+"Once your server is started, you can query the model with input prompts"
+msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。"
+
+#: ../../tutorials/single_node_300i.md
+msgid "Qwen/Qwen2.5-7B-Instruct"
+msgstr "Qwen/Qwen2.5-7B-Instruct"
+
+#: ../../tutorials/single_node_300i.md
+msgid "Pangu-Pro-MoE-72B"
+msgstr "Pangu-Pro-MoE-72B"
+
+#: ../../tutorials/single_node_300i.md:119
+#: ../../tutorials/single_node_300i.md:257
+msgid "Download the model:"
+msgstr "下载该模型:"
+
+#: ../../tutorials/single_node_300i.md:157
+msgid "If you run this script successfully, you can see the results."
+msgstr "如果你成功运行此脚本,你就可以看到结果。"
+
+#: ../../tutorials/single_node_300i.md:159
+msgid "Offline Inference"
+msgstr "离线推理"
+
+#: ../../tutorials/single_node_300i.md:161
+msgid ""
+"Run the following script (`example.py`) to execute offline inference on NPU:"
+msgstr "运行以下脚本(`example.py`)以在 NPU 上执行离线推理:"
+
+#: ../../tutorials/single_node_300i.md
+msgid "Qwen2.5-7B-Instruct"
+msgstr "Qwen2.5-7B-Instruct"
+
+#: ../../tutorials/single_node_300i.md:320
+msgid "Run script:"
+msgstr "运行脚本:"
+
+#: ../../tutorials/single_node_300i.md:325
+msgid "If you run this script successfully, you can see the info shown below:"
+msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:"
diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po
new file mode 100644
index 0000000..9b69577
--- /dev/null
+++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu.po
@@ -0,0 +1,107 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2025, vllm-ascend team
+# This file is distributed under the same license as the vllm-ascend
+# package.
+# FIRST AUTHOR , 2025.
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu.md:1 +msgid "Single NPU (Qwen3 8B)" +msgstr "单个NPU(Qwen3 8B)" + +#: ../../tutorials/single_npu.md:3 +msgid "Run vllm-ascend on Single NPU" +msgstr "在单个 NPU 上运行 vllm-ascend" + +#: ../../tutorials/single_npu.md:5 +msgid "Offline Inference on Single NPU" +msgstr "在单个NPU上进行离线推理" + +#: ../../tutorials/single_npu.md:7 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/single_npu.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu.md:40 +msgid "" +"`max_split_size_mb` prevents the native allocator from splitting blocks " +"larger than this size (in MB). This can reduce fragmentation and may allow " +"some borderline workloads to complete without running out of memory. You can" +" find more details " +"[here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+msgstr "" +"`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +"为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#: ../../tutorials/single_npu.md:43 +msgid "Run the following script to execute offline inference on a single NPU:" +msgstr "运行以下脚本以在单个 NPU 上执行离线推理:" + +#: ../../tutorials/single_npu.md +msgid "Graph Mode" +msgstr "图模式" + +#: ../../tutorials/single_npu.md +msgid "Eager Mode" +msgstr "即时模式" + +#: ../../tutorials/single_npu.md:98 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#: ../../tutorials/single_npu.md:105 +msgid "Online Serving on Single NPU" +msgstr "单个 NPU 上的在线服务" + +#: ../../tutorials/single_npu.md:107 +msgid "Run docker container to start the vLLM server on a single NPU:" +msgstr "运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:" + +#: ../../tutorials/single_npu.md:163 +msgid "" +"Add `--max_model_len` option to avoid ValueError that the Qwen2.5-7B model's" +" max seq len (32768) is larger than the maximum number of tokens that can be" +" stored in KV cache (26240). This will differ with different NPU series base" +" on the HBM size. Please modify the value according to a suitable value for " +"your NPU series." 
+msgstr "" +"添加 `--max_model_len` 选项,以避免出现 Qwen2.5-7B 模型的最大序列长度(32768)大于 KV 缓存能存储的最大 " +"token 数(26240)时的 ValueError。不同 NPU 系列由于 HBM 容量不同,该值也会有所不同。请根据您的 NPU " +"系列,修改为合适的数值。" + +#: ../../tutorials/single_npu.md:166 +msgid "If your service start successfully, you can see the info shown below:" +msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#: ../../tutorials/single_npu.md:174 +msgid "" +"Once your server is started, you can query the model with input prompts:" +msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#: ../../tutorials/single_npu.md:187 +msgid "" +"If you query the server successfully, you can see the info shown below " +"(client):" +msgstr "如果你成功查询了服务器,你可以看到如下所示的信息(客户端):" + +#: ../../tutorials/single_npu.md:193 +msgid "Logs of the vllm server:" +msgstr "vllm 服务器的日志:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po new file mode 100644 index 0000000..71bad64 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_audio.po @@ -0,0 +1,77 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu_audio.md:1 +msgid "Single NPU (Qwen2-Audio 7B)" +msgstr "单个 NPU(Qwen2-Audio 7B)" + +#: ../../tutorials/single_npu_audio.md:3 +msgid "Run vllm-ascend on Single NPU" +msgstr "在单个 NPU 上运行 vllm-ascend" + +#: ../../tutorials/single_npu_audio.md:5 +msgid "Offline Inference on Single NPU" +msgstr "在单个NPU上进行离线推理" + +#: ../../tutorials/single_npu_audio.md:7 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/single_npu_audio.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu_audio.md:40 +msgid "" +"`max_split_size_mb` prevents the native allocator from splitting blocks " +"larger than this size (in MB). This can reduce fragmentation and may allow " +"some borderline workloads to complete without running out of memory. You can" +" find more details " +"[here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+msgstr "" +"`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +"为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#: ../../tutorials/single_npu_audio.md:43 +msgid "Install packages required for audio processing:" +msgstr "安装音频处理所需的软件包:" + +#: ../../tutorials/single_npu_audio.md:50 +msgid "Run the following script to execute offline inference on a single NPU:" +msgstr "运行以下脚本以在单个 NPU 上执行离线推理:" + +#: ../../tutorials/single_npu_audio.md:114 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#: ../../tutorials/single_npu_audio.md:120 +msgid "Online Serving on Single NPU" +msgstr "单个 NPU 上的在线服务" + +#: ../../tutorials/single_npu_audio.md:122 +msgid "" +"Currently, vllm's OpenAI-compatible server doesn't support audio inputs, " +"find more details [here](https://github.com/vllm-" +"project/vllm/issues/19977)." +msgstr "" +"目前,vllm 的兼容 OpenAI 的服务器不支持音频输入,更多详情请查看[这里](https://github.com/vllm-" +"project/vllm/issues/19977)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po new file mode 100644 index 0000000..0007af0 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_multimodal.po @@ -0,0 +1,99 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu_multimodal.md:1 +msgid "Single NPU (Qwen2.5-VL 7B)" +msgstr "单个NPU(Qwen2.5-VL 7B)" + +#: ../../tutorials/single_npu_multimodal.md:3 +msgid "Run vllm-ascend on Single NPU" +msgstr "在单个 NPU 上运行 vllm-ascend" + +#: ../../tutorials/single_npu_multimodal.md:5 +msgid "Offline Inference on Single NPU" +msgstr "在单个NPU上进行离线推理" + +#: ../../tutorials/single_npu_multimodal.md:7 +msgid "Run docker container:" +msgstr "运行 docker 容器:" + +#: ../../tutorials/single_npu_multimodal.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu_multimodal.md:40 +msgid "" +"`max_split_size_mb` prevents the native allocator from splitting blocks " +"larger than this size (in MB). This can reduce fragmentation and may allow " +"some borderline workloads to complete without running out of memory. You can" +" find more details " +"[here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)." 
+msgstr "" +"`max_split_size_mb` 防止本地分配器拆分超过此大小(以 MB " +"为单位)的内存块。这可以减少内存碎片,并且可能让一些边缘情况下的工作负载顺利完成而不会耗尽内存。你可以在[这里](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html)找到更多详细信息。" + +#: ../../tutorials/single_npu_multimodal.md:43 +msgid "Run the following script to execute offline inference on a single NPU:" +msgstr "运行以下脚本以在单个 NPU 上执行离线推理:" + +#: ../../tutorials/single_npu_multimodal.md:109 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" + +#: ../../tutorials/single_npu_multimodal.md:121 +msgid "Online Serving on Single NPU" +msgstr "单个 NPU 上的在线服务" + +#: ../../tutorials/single_npu_multimodal.md:123 +msgid "Run docker container to start the vLLM server on a single NPU:" +msgstr "运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:" + +#: ../../tutorials/single_npu_multimodal.md:154 +msgid "" +"Add `--max_model_len` option to avoid ValueError that the " +"Qwen2.5-VL-7B-Instruct model's max seq len (128000) is larger than the " +"maximum number of tokens that can be stored in KV cache. This will differ " +"with different NPU series base on the HBM size. Please modify the value " +"according to a suitable value for your NPU series." 
+msgstr "" +"新增 `--max_model_len` 选项,以避免出现 ValueError,即 Qwen2.5-VL-7B-Instruct " +"模型的最大序列长度(128000)大于 KV 缓存可存储的最大 token 数。该数值会根据不同 NPU 系列的 HBM 大小而不同。请根据你的 NPU" +" 系列,将该值设置为合适的数值。" + +#: ../../tutorials/single_npu_multimodal.md:157 +msgid "If your service start successfully, you can see the info shown below:" +msgstr "如果你的服务启动成功,你会看到如下所示的信息:" + +#: ../../tutorials/single_npu_multimodal.md:165 +msgid "" +"Once your server is started, you can query the model with input prompts:" +msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:" + +#: ../../tutorials/single_npu_multimodal.md:182 +msgid "" +"If you query the server successfully, you can see the info shown below " +"(client):" +msgstr "如果你成功查询了服务器,你可以看到如下所示的信息(客户端):" + +#: ../../tutorials/single_npu_multimodal.md:188 +msgid "Logs of the vllm server:" +msgstr "vllm 服务器的日志:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po new file mode 100644 index 0000000..4eb349a --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/tutorials/single_npu_qwen3_embedding.po @@ -0,0 +1,70 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../tutorials/single_npu_qwen3_embedding.md:1 +msgid "Single NPU (Qwen3-Embedding-8B)" +msgstr "单个NPU(Qwen3-Embedding-8B)" + +#: ../../tutorials/single_npu_qwen3_embedding.md:3 +msgid "" +"The Qwen3 Embedding model series is the latest proprietary model of the Qwen" +" family, specifically designed for text embedding and ranking tasks. " +"Building upon the dense foundational models of the Qwen3 series, it provides" +" a comprehensive range of text embeddings and reranking models in various " +"sizes (0.6B, 4B, and 8B). This guide describes how to run the model with " +"vLLM Ascend. Note that only 0.9.2rc1 and higher versions of vLLM Ascend " +"support the model." 
+msgstr "" +"Qwen3 Embedding 模型系列是 Qwen 家族最新的专有模型,专为文本嵌入和排序任务设计。在 Qwen3 " +"系列的密集基础模型之上,它提供了多种尺寸(0.6B、4B 和 8B)的文本嵌入与重排序模型。本指南介绍如何使用 vLLM Ascend " +"运行该模型。请注意,只有 vLLM Ascend 0.9.2rc1 及更高版本才支持该模型。" + +#: ../../tutorials/single_npu_qwen3_embedding.md:5 +msgid "Run docker container" +msgstr "运行 docker 容器" + +#: ../../tutorials/single_npu_qwen3_embedding.md:7 +msgid "" +"Take Qwen3-Embedding-8B model as an example, first run the docker container " +"with the following command:" +msgstr "以 Qwen3-Embedding-8B 模型为例,首先使用以下命令运行 docker 容器:" + +#: ../../tutorials/single_npu_qwen3_embedding.md:29 +msgid "Setup environment variables:" +msgstr "设置环境变量:" + +#: ../../tutorials/single_npu_qwen3_embedding.md:39 +msgid "Online Inference" +msgstr "在线推理" + +#: ../../tutorials/single_npu_qwen3_embedding.md:45 +msgid "" +"Once your server is started, you can query the model with input prompts" +msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。" + +#: ../../tutorials/single_npu_qwen3_embedding.md:56 +msgid "Offline Inference" +msgstr "离线推理" + +#: ../../tutorials/single_npu_qwen3_embedding.md:92 +msgid "If you run this script successfully, you can see the info shown below:" +msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po new file mode 100644 index 0000000..54dacd6 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/additional_config.po @@ -0,0 +1,290 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/configuration/additional_config.md:1 +msgid "Additional Configuration" +msgstr "附加配置" + +#: ../../user_guide/configuration/additional_config.md:3 +msgid "" +"additional configuration is a mechanism provided by vLLM to allow plugins to" +" control inner behavior by their own. vLLM Ascend uses this mechanism to " +"make the project more flexible." +msgstr "额外配置是 vLLM 提供的一种机制,允许插件自行控制内部行为。vLLM Ascend 利用这种机制使项目更加灵活。" + +#: ../../user_guide/configuration/additional_config.md:5 +msgid "How to use" +msgstr "如何使用" + +#: ../../user_guide/configuration/additional_config.md:7 +msgid "" +"With either online mode or offline mode, users can use additional " +"configuration. 
Take Qwen3 as an example:" +msgstr "无论是在线模式还是离线模式,用户都可以使用额外的配置。以 Qwen3 为例:" + +#: ../../user_guide/configuration/additional_config.md:9 +msgid "**Online mode**:" +msgstr "**在线模式**:" + +#: ../../user_guide/configuration/additional_config.md:15 +msgid "**Offline mode**:" +msgstr "**离线模式**:" + +#: ../../user_guide/configuration/additional_config.md:23 +msgid "Configuration options" +msgstr "配置选项" + +#: ../../user_guide/configuration/additional_config.md:25 +msgid "" +"The following table lists the additional configuration options available in " +"vLLM Ascend:" +msgstr "下表列出了 vLLM Ascend 中可用的其他配置选项:" + +#: ../../user_guide/configuration/additional_config.md +msgid "Name" +msgstr "名称" + +#: ../../user_guide/configuration/additional_config.md +msgid "Type" +msgstr "类型" + +#: ../../user_guide/configuration/additional_config.md +msgid "Default" +msgstr "默认" + +#: ../../user_guide/configuration/additional_config.md +msgid "Description" +msgstr "描述" + +#: ../../user_guide/configuration/additional_config.md +msgid "`torchair_graph_config`" +msgstr "`torchair_graph_config`" + +#: ../../user_guide/configuration/additional_config.md +msgid "dict" +msgstr "dict" + +#: ../../user_guide/configuration/additional_config.md +#, python-brace-format +msgid "`{}`" +msgstr "`{}`" + +#: ../../user_guide/configuration/additional_config.md +msgid "The config options for torchair graph mode" +msgstr "torchair 图模式的配置选项" + +#: ../../user_guide/configuration/additional_config.md +msgid "`ascend_scheduler_config`" +msgstr "`ascend_scheduler_config`" + +#: ../../user_guide/configuration/additional_config.md +msgid "The config options for ascend scheduler" +msgstr "ascend 调度器的配置选项" + +#: ../../user_guide/configuration/additional_config.md +msgid "`expert_tensor_parallel_size`" +msgstr "`expert_tensor_parallel_size`" + +#: ../../user_guide/configuration/additional_config.md +msgid "str" +msgstr "str" + +#: ../../user_guide/configuration/additional_config.md +msgid "`0`" +msgstr "`0`" + +#: 
../../user_guide/configuration/additional_config.md
+msgid "Expert tensor parallel size the model to use."
+msgstr "模型使用的专家张量并行大小。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`refresh`"
+msgstr "`refresh`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "bool"
+msgstr "bool"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`false`"
+msgstr "`false`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"Whether to refresh global ascend config content. This value is usually used "
+"by rlhf or ut/e2e test case."
+msgstr "是否刷新全局 ascend 配置信息。此值通常由 rlhf 或 ut/e2e 测试用例使用。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`expert_map_path`"
+msgstr "`expert_map_path`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`None`"
+msgstr "`None`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"When using expert load balancing for the MOE model, an expert map path needs"
+" to be passed in."
+msgstr "在为MOE模型使用专家负载均衡时,需要传入专家映射路径。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`chunked_prefill_for_mla`"
+msgstr "`chunked_prefill_for_mla`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`False`"
+msgstr "`False`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "Whether to enable the fused operator-like chunked_prefill."
+msgstr "是否启用类似算子融合的 chunked_prefill 功能。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`kv_cache_dtype`"
+msgstr "`kv_cache_dtype`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"When using the kv cache quantization method, kv cache dtype needs to be set,"
+" currently only int8 is supported."
+msgstr "当使用kv缓存量化方法时,需要设置kv缓存的数据类型,目前仅支持int8。"
+
+#: ../../user_guide/configuration/additional_config.md:37
+msgid "The details of each config option are as follows:"
+msgstr "每个配置选项的详细信息如下:"
+
+#: ../../user_guide/configuration/additional_config.md:39
+msgid "**torchair_graph_config**"
+msgstr "**torchair_graph_config**"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`enabled`"
+msgstr "`enabled`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"Whether to enable torchair graph mode. Currently only DeepSeek series models"
+" and PanguProMoE are supported to use torchair graph mode"
+msgstr "是否启用 torchair 图模式。目前仅支持 DeepSeek 系列模型和 PanguProMoE 使用 torchair 图模式。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`enable_multistream_mla`"
+msgstr "`enable_multistream_mla`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"Whether to put vector ops of MLA to another stream. This option only takes "
+"effects on models using MLA (e.g., DeepSeek)."
+msgstr "是否将MLA的向量操作放到另一个流中。此选项仅对使用MLA的模型(例如,DeepSeek)有效。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`enable_multistream_moe`"
+msgstr "`enable_multistream_moe`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"Whether to enable multistream shared expert. This option only takes effects "
+"on DeepSeek moe models."
+msgstr "是否启用多流共享专家功能。此选项仅对 DeepSeek MoE 模型生效。"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`enable_view_optimize`"
+msgstr "`enable_view_optimize`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`True`"
+msgstr "`True`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "Whether to enable torchair view optimization"
+msgstr "是否启用torchair视图优化"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`use_cached_graph`"
+msgstr "`use_cached_graph`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "Whether to use cached graph"
+msgstr "是否使用缓存的图"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`graph_batch_sizes`"
+msgstr "`graph_batch_sizes`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "list[int]"
+msgstr "list[int]"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`[]`"
+msgstr "`[]`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "The batch size for torchair graph cache"
+msgstr "torchair 图缓存的批量大小"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`graph_batch_sizes_init`"
+msgstr "`graph_batch_sizes_init`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "Init graph batch size dynamically if `graph_batch_sizes` is empty"
+msgstr "如果 `graph_batch_sizes` 为空,则动态初始化图批大小"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid "`enable_kv_nz`"
+msgstr "`enable_kv_nz`"
+
+#: ../../user_guide/configuration/additional_config.md
+msgid ""
+"Whether to enable kvcache NZ layout. This option only takes effects on "
+"models using MLA (e.g., DeepSeek)."
+msgstr "是否启用 kvcache NZ 布局。此选项仅对使用 MLA 的模型(例如 DeepSeek)生效。" + +#: ../../user_guide/configuration/additional_config.md:52 +msgid "**ascend_scheduler_config**" +msgstr "**ascend_scheduler_config**" + +#: ../../user_guide/configuration/additional_config.md +msgid "Whether to enable ascend scheduler for V1 engine" +msgstr "是否为 V1 引擎启用 ascend 调度器" + +#: ../../user_guide/configuration/additional_config.md:58 +msgid "" +"ascend_scheduler_config also support the options from [vllm scheduler " +"config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig)." +" For example, you can add `enable_chunked_prefill: True` to " +"ascend_scheduler_config as well." +msgstr "" +"ascend_scheduler_config 也支持来自 [vllm scheduler " +"config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig)" +" 的选项。例如,你也可以在 ascend_scheduler_config 中添加 `enable_chunked_prefill: True`。" + +#: ../../user_guide/configuration/additional_config.md:60 +msgid "Example" +msgstr "示例" + +#: ../../user_guide/configuration/additional_config.md:62 +msgid "An example of additional configuration is as follows:" +msgstr "以下是额外配置的一个示例:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po new file mode 100644 index 0000000..769efa2 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/env_vars.po @@ -0,0 +1,28 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/configuration/env_vars.md:1 +msgid "Environment Variables" +msgstr "环境变量" + +#: ../../user_guide/configuration/env_vars.md:3 +msgid "" +"vllm-ascend uses the following environment variables to configure the " +"system:" +msgstr "vllm-ascend 使用以下环境变量来配置系统:" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po new file mode 100644 index 0000000..2c5f790 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/configuration/index.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/configuration/index.md:1 +#: ../../user_guide/configuration/index.md:5 +msgid "Configuration Guide" +msgstr "配置指南" + +#: ../../user_guide/configuration/index.md:3 +msgid "This section provides a detailed configuration guide of vLLM Ascend." 
+msgstr "本节提供了 vLLM Ascend 的详细配置指南。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po new file mode 100644 index 0000000..9680bdb --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/graph_mode.po @@ -0,0 +1,121 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/graph_mode.md:1 +msgid "Graph Mode Guide" +msgstr "图模式指南" + +#: ../../user_guide/feature_guide/graph_mode.md:4 +msgid "" +"This feature is currently experimental. In future versions, there may be " +"behavioral changes around configuration, coverage, performance improvement." +msgstr "此功能目前为实验性功能。在未来的版本中,配置、覆盖率和性能改进等方面的行为可能会有变化。" + +#: ../../user_guide/feature_guide/graph_mode.md:7 +msgid "" +"This guide provides instructions for using Ascend Graph Mode with vLLM " +"Ascend. Please note that graph mode is only available on V1 Engine. And only" +" Qwen, DeepSeek series models are well tested from 0.9.0rc1. We'll make it " +"stable and generalize in the next release." 
+msgstr "" +"本指南提供了在 vLLM Ascend 上使用 Ascend 图模式的操作说明。请注意,图模式仅在 V1 引擎上可用,并且从 0.9.0rc1 起,仅对" +" Qwen、DeepSeek 系列模型进行了充分测试。我们将在下一个版本中使其更加稳定和通用。" + +#: ../../user_guide/feature_guide/graph_mode.md:9 +msgid "Getting Started" +msgstr "快速入门" + +#: ../../user_guide/feature_guide/graph_mode.md:11 +msgid "" +"From v0.9.1rc1 with V1 Engine, vLLM Ascend will run models in graph mode by " +"default to keep the same behavior with vLLM. If you hit any issues, please " +"feel free to open an issue on GitHub and fallback to eager mode temporarily " +"by set `enforce_eager=True` when initializing the model." +msgstr "" +"从 v0.9.1rc1 版本起,使用 V1 引擎时,vLLM Ascend 默认将在图模式下运行模型,以保持与 vLLM " +"同样的行为。如果遇到任何问题,欢迎在 GitHub 上提交 issue,并在初始化模型时通过设置 `enforce_eager=True` 临时切换回 " +"eager 模式。" + +#: ../../user_guide/feature_guide/graph_mode.md:13 +msgid "There are two kinds for graph mode supported by vLLM Ascend:" +msgstr "vLLM Ascend 支持两种图模式:" + +#: ../../user_guide/feature_guide/graph_mode.md:14 +msgid "" +"**ACLGraph**: This is the default graph mode supported by vLLM Ascend. In " +"v0.9.1rc1, only Qwen series models are well tested." +msgstr "" +"**ACLGraph**:这是 vLLM Ascend 支持的默认图模式。在 v0.9.1rc1 版本中,只有 Qwen 系列模型得到了充分测试。" + +#: ../../user_guide/feature_guide/graph_mode.md:15 +msgid "" +"**TorchAirGraph**: This is the GE graph mode. In v0.9.1rc1, only DeepSeek " +"series models are supported." +msgstr "**TorchAirGraph**:这是GE图模式。在v0.9.1rc1版本中,仅支持DeepSeek系列模型。" + +#: ../../user_guide/feature_guide/graph_mode.md:17 +msgid "Using ACLGraph" +msgstr "使用 ACLGraph" + +#: ../../user_guide/feature_guide/graph_mode.md:18 +msgid "" +"ACLGraph is enabled by default. Take Qwen series models as an example, just " +"set to use V1 Engine is enough." 
+msgstr "ACLGraph 默认启用。以 Qwen 系列模型为例,只需设置为使用 V1 引擎即可。" + +#: ../../user_guide/feature_guide/graph_mode.md:20 +#: ../../user_guide/feature_guide/graph_mode.md:41 +#: ../../user_guide/feature_guide/graph_mode.md:64 +msgid "offline example:" +msgstr "离线示例:" + +#: ../../user_guide/feature_guide/graph_mode.md:31 +#: ../../user_guide/feature_guide/graph_mode.md:52 +#: ../../user_guide/feature_guide/graph_mode.md:74 +msgid "online example:" +msgstr "在线示例:" + +#: ../../user_guide/feature_guide/graph_mode.md:37 +msgid "Using TorchAirGraph" +msgstr "使用 TorchAirGraph" + +#: ../../user_guide/feature_guide/graph_mode.md:39 +msgid "" +"If you want to run DeepSeek series models with graph mode, you should use " +"[TorchAirGraph](https://www.hiascend.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html)." +" In this case, additional config is required." +msgstr "" +"如果你想通过图模式运行 DeepSeek 系列模型,你应该使用 " +"[TorchAirGraph](https://www.hiascend.com/document/detail/zh/Pytorch/700/modthirdparty/torchairuseguide/torchair_0002.html)。在这种情况下,需要额外的配置。" + +#: ../../user_guide/feature_guide/graph_mode.md:58 +msgid "" +"You can find more detail about additional config " +"[here](../configuration/additional_config.md)." +msgstr "你可以在[这里](../configuration/additional_config.md)找到关于附加配置的更多详细信息。" + +#: ../../user_guide/feature_guide/graph_mode.md:60 +msgid "Fallback to Eager Mode" +msgstr "回退到 Eager 模式" + +#: ../../user_guide/feature_guide/graph_mode.md:62 +msgid "" +"If both `ACLGraph` and `TorchAirGraph` fail to run, you should fallback to " +"eager mode." +msgstr "如果 `ACLGraph` 和 `TorchAirGraph` 都无法运行,你应该退回到 eager 模式。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po new file mode 100644 index 0000000..624c39f --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/index.po @@ -0,0 +1,30 @@ +# SOME DESCRIPTIVE TITLE. 
+# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/index.md:1 +#: ../../user_guide/feature_guide/index.md:5 +msgid "Feature Guide" +msgstr "功能指南" + +#: ../../user_guide/feature_guide/index.md:3 +msgid "This section provides a detailed usage guide of vLLM Ascend features." +msgstr "本节提供了 vLLM Ascend 功能的详细使用指南。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po new file mode 100644 index 0000000..957ec17 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/lora.po @@ -0,0 +1,58 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/lora.md:1 +msgid "LoRA Adapters Guide" +msgstr "LoRA 适配器指南" + +#: ../../user_guide/feature_guide/lora.md:3 +msgid "" +"Like vLLM, vllm-ascend supports LoRA as well. The usage and more details can" +" be found in [vLLM official " +"document](https://docs.vllm.ai/en/latest/features/lora.html)." +msgstr "" +"与 vLLM 类似,vllm-ascend 也支持 LoRA。用法及更多详情可参见 [vLLM " +"官方文档](https://docs.vllm.ai/en/latest/features/lora.html)。" + +#: ../../user_guide/feature_guide/lora.md:5 +msgid "" +"You can also refer to " +"[this](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-" +"text-only-language-models) to find which models support LoRA in vLLM." +msgstr "" +"你也可以参考[这个链接](https://docs.vllm.ai/en/latest/models/supported_models.html#list-" +"of-text-only-language-models)来查找哪些模型在 vLLM 中支持 LoRA。" + +#: ../../user_guide/feature_guide/lora.md:7 +msgid "Tips" +msgstr "提示" + +#: ../../user_guide/feature_guide/lora.md:8 +msgid "" +"If you fail to run vllm-ascend with LoRA, you may follow [this " +"instruction](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html#fallback-" +"to-eager-mode) to disable graph mode and try again." 
+msgstr "" +"如果你在使用 LoRA 运行 vllm-ascend 时失败,可以按照[此说明](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html#fallback-" +"to-eager-mode)禁用图模式后再重试。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po new file mode 100644 index 0000000..54f524e --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/quantization.po @@ -0,0 +1,183 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/quantization.md:1 +msgid "Quantization Guide" +msgstr "量化指南" + +#: ../../user_guide/feature_guide/quantization.md:3 +msgid "" +"Model quantization is a technique that reduces the size and computational " +"requirements of a model by lowering the data precision of the weights and " +"activation values in the model, thereby saving the memory and improving the " +"inference speed." +msgstr "模型量化是一种通过降低模型中权重和激活值的数据精度,从而减少模型大小和计算需求的技术,这样可以节省内存并提高推理速度。" + +#: ../../user_guide/feature_guide/quantization.md:5 +msgid "" +"Since 0.9.0rc2 version, quantization feature is experimentally supported in " +"vLLM Ascend. Users can enable quantization feature by specifying " +"`--quantization ascend`. Currently, only Qwen, DeepSeek series models are " +"well tested. 
We’ll support more quantization algorithm and models in the " +"future." +msgstr "" +"自 0.9.0rc2 版本起,vLLM Ascend 实验性地支持量化特性。用户可以通过指定 `--quantization ascend` " +"启用量化功能。目前,只有 Qwen、DeepSeek 系列模型经过了充分测试。未来我们将支持更多的量化算法和模型。" + +#: ../../user_guide/feature_guide/quantization.md:7 +msgid "Install modelslim" +msgstr "安装 modelslim" + +#: ../../user_guide/feature_guide/quantization.md:9 +msgid "" +"To quantize a model, users should install " +"[ModelSlim](https://gitee.com/ascend/msit/blob/master/msmodelslim/README.md)" +" which is the Ascend compression and acceleration tool. It is an affinity-" +"based compression tool designed for acceleration, using compression as its " +"core technology and built upon the Ascend platform." +msgstr "" +"要对模型进行量化,用户应安装[ModelSlim](https://gitee.com/ascend/msit/blob/master/msmodelslim/README.md),这是昇腾的压缩与加速工具。它是一种基于亲和性的压缩工具,专为加速设计,以压缩为核心技术,并基于昇腾平台构建。" + +#: ../../user_guide/feature_guide/quantization.md:11 +msgid "" +"Currently, only the specific tag [modelslim-" +"VLLM-8.1.RC1.b020_001](https://gitee.com/ascend/msit/blob/modelslim-" +"VLLM-8.1.RC1.b020_001/msmodelslim/README.md) of modelslim works with vLLM " +"Ascend. Please do not install other version until modelslim master version " +"is available for vLLM Ascend in the future." +msgstr "" +"目前,只有 modelslim 的特定标签 [modelslim-" +"VLLM-8.1.RC1.b020_001](https://gitee.com/ascend/msit/blob/modelslim-" +"VLLM-8.1.RC1.b020_001/msmodelslim/README.md) 支持 vLLM Ascend。在未来 modelslim " +"的主版本支持 vLLM Ascend 之前,请不要安装其他版本。" + +#: ../../user_guide/feature_guide/quantization.md:13 +msgid "Install modelslim:" +msgstr "安装 modelslim:" + +#: ../../user_guide/feature_guide/quantization.md:21 +msgid "Quantize model" +msgstr "量化模型" + +#: ../../user_guide/feature_guide/quantization.md:23 +#, python-format +msgid "" +"Take [DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-" +"ai/DeepSeek-V2-Lite) as an example, you just need to download the model, and" +" then execute the convert command. 
The command is shown below. More info can" +" be found in modelslim doc [deepseek w8a8 dynamic quantization " +"docs](https://gitee.com/ascend/msit/blob/modelslim-" +"VLLM-8.1.RC1.b020_001/msmodelslim/example/DeepSeek/README.md#deepseek-v2-w8a8-dynamic%E9%87%8F%E5%8C%96)." +msgstr "" +"以 [DeepSeek-V2-Lite](https://modelscope.cn/models/deepseek-" +"ai/DeepSeek-V2-Lite) 为例,你只需要下载模型,然后执行转换命令。命令如下所示。更多信息可参考 modelslim 文档 " +"[deepseek w8a8 动态量化文档](https://gitee.com/ascend/msit/blob/modelslim-" +"VLLM-8.1.RC1.b020_001/msmodelslim/example/DeepSeek/README.md#deepseek-v2-w8a8-dynamic%E9%87%8F%E5%8C%96)。" + +#: ../../user_guide/feature_guide/quantization.md:32 +msgid "" +"You can also download the quantized model that we uploaded. Please note that" +" these weights should be used for test only. For example, " +"https://www.modelscope.cn/models/vllm-ascend/DeepSeek-V2-Lite-W8A8" +msgstr "" +"你也可以下载我们上传的量化模型。请注意,这些权重仅应用于测试。例如:https://www.modelscope.cn/models/vllm-" +"ascend/DeepSeek-V2-Lite-W8A8" + +#: ../../user_guide/feature_guide/quantization.md:35 +msgid "Once convert action is done, there are two important files generated." +msgstr "转换操作完成后,会生成两个重要的文件。" + +#: ../../user_guide/feature_guide/quantization.md:37 +msgid "" +"[config.json](https://www.modelscope.cn/models/vllm-" +"ascend/DeepSeek-V2-Lite-W8A8/file/view/master/config.json?status=1). Please " +"make sure that there is no `quantization_config` field in it." +msgstr "" +"[config.json](https://www.modelscope.cn/models/vllm-" +"ascend/DeepSeek-V2-Lite-W8A8/file/view/master/config.json?status=1)。请确保其中没有 " +"`quantization_config` 字段。" + +#: ../../user_guide/feature_guide/quantization.md:39 +msgid "" +"[quant_model_description.json](https://www.modelscope.cn/models/vllm-" +"ascend/DeepSeek-V2-Lite-W8A8/file/view/master/quant_model_description.json?status=1)." +" All the converted weights info are recorded in this file." 
+msgstr "" +"[quant_model_description.json](https://www.modelscope.cn/models/vllm-" +"ascend/DeepSeek-V2-Lite-W8A8/file/view/master/quant_model_description.json?status=1)。所有被转换的权重信息都记录在该文件中。" + +#: ../../user_guide/feature_guide/quantization.md:41 +msgid "Here is the full converted model files:" +msgstr "以下是完整转换后的模型文件:" + +#: ../../user_guide/feature_guide/quantization.md:60 +msgid "Run the model" +msgstr "运行模型" + +#: ../../user_guide/feature_guide/quantization.md:62 +msgid "" +"Now, you can run the quantized models with vLLM Ascend. Here is the example " +"for online and offline inference." +msgstr "现在,你可以使用 vLLM Ascend 运行量化模型。下面是在线和离线推理的示例。" + +#: ../../user_guide/feature_guide/quantization.md:64 +msgid "Offline inference" +msgstr "离线推理" + +#: ../../user_guide/feature_guide/quantization.md:90 +msgid "Online inference" +msgstr "在线推理" + +#: ../../user_guide/feature_guide/quantization.md:97 +msgid "FAQs" +msgstr "常见问题解答" + +#: ../../user_guide/feature_guide/quantization.md:99 +msgid "" +"1. How to solve the KeyError: 'xxx.layers.0.self_attn.q_proj.weight' " +"problem?" +msgstr "1. 如何解决 KeyError: 'xxx.layers.0.self_attn.q_proj.weight' 问题?" + +#: ../../user_guide/feature_guide/quantization.md:101 +msgid "" +"First, make sure you specify `ascend` quantization method. Second, check if " +"your model is converted by this `modelslim-VLLM-8.1.RC1.b020_001` modelslim " +"version. Finally, if it still doesn't work, please submit a issue, maybe " +"some new models need to be adapted." +msgstr "" +"首先,请确保你指定了 `ascend` 量化方法。其次,检查你的模型是否由 `modelslim-VLLM-8.1.RC1.b020_001` 这个 " +"modelslim 版本转换。如果仍然无法使用,请提交一个 issue,可能有一些新模型需要适配。" + +#: ../../user_guide/feature_guide/quantization.md:104 +msgid "" +"2. How to solve the error \"Could not locate the " +"configuration_deepseek.py\"?" +msgstr "2. 如何解决“无法找到 configuration_deepseek.py”错误?" 
+ +#: ../../user_guide/feature_guide/quantization.md:106 +msgid "" +"Please convert DeepSeek series models using `modelslim-" +"VLLM-8.1.RC1.b020_001` modelslim, this version has fixed the missing " +"configuration_deepseek.py error." +msgstr "" +"请使用 `modelslim-VLLM-8.1.RC1.b020_001` 的 modelslim 转换 DeepSeek 系列模型,该版本已修复缺少 " +"configuration_deepseek.py 的错误。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po new file mode 100644 index 0000000..a3bd1b2 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/sleep_mode.po @@ -0,0 +1,156 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/sleep_mode.md:1 +msgid "Sleep Mode Guide" +msgstr "睡眠模式指南" + +#: ../../user_guide/feature_guide/sleep_mode.md:3 +msgid "Overview" +msgstr "概述" + +#: ../../user_guide/feature_guide/sleep_mode.md:5 +msgid "" +"Sleep Mode is an API designed to offload model weights and discard KV cache " +"from NPU memory. This functionality is essential for reinforcement learning " +"(RL) post-training workloads, particularly in online algorithms such as PPO," +" GRPO, or DPO. During training, the policy model typically performs auto-" +"regressive generation using inference engines like vLLM, followed by forward" +" and backward passes for optimization." 
+msgstr "" +"Sleep Mode 是一个用于卸载模型权重并清除 NPU 内存中 KV 缓存的 API。此功能对于强化学习(RL)后训练任务尤其重要,特别是在 " +"PPO、GRPO 或 DPO 等在线算法中。在训练过程中,策略模型通常会使用像 vLLM " +"这样的推理引擎进行自回归生成,然后进行前向和反向传播以进行优化。" + +#: ../../user_guide/feature_guide/sleep_mode.md:7 +msgid "" +"Since the generation and training phases may employ different model " +"parallelism strategies, it becomes crucial to free KV cache and even offload" +" model parameters stored within vLLM during training. This ensures efficient" +" memory utilization and avoids resource contention on the NPU." +msgstr "" +"由于生成和训练阶段可能采用不同的模型并行策略,因此在训练过程中及时释放 KV 缓存,甚至卸载存储在 vLLM " +"内的模型参数变得至关重要。这可以确保内存的高效利用,并避免 NPU 上的资源争用。" + +#: ../../user_guide/feature_guide/sleep_mode.md:10 +msgid "Getting started" +msgstr "快速上手" + +#: ../../user_guide/feature_guide/sleep_mode.md:12 +#, python-brace-format +msgid "" +"With `enable_sleep_mode=True`, the way we manage memory(malloc, free) in " +"vllm will under a specific memory pool, during loading model and initialize " +"kv_caches, we tag the memory as a map: `{\"weight\": data, \"kv_cache\": " +"data}`." +msgstr "" +"当 `enable_sleep_mode=True` 时,我们在 vllm 中管理内存(malloc, " +"free)的方式会在一个特定的内存池下进行,在加载模型和初始化 kv_caches " +"期间,我们会将内存打上标签,组织成一个映射:`{\"weight\": data, \"kv_cache\": data}`。" + +#: ../../user_guide/feature_guide/sleep_mode.md:14 +msgid "" +"The engine(v0/v1) supports two sleep levels to manage memory during idle " +"periods:" +msgstr "该引擎(v0/v1)支持两种睡眠等级,以在空闲期间管理内存:" + +#: ../../user_guide/feature_guide/sleep_mode.md:16 +msgid "Level 1 Sleep" +msgstr "一级睡眠" + +#: ../../user_guide/feature_guide/sleep_mode.md:17 +msgid "Action: Offloads model weights and discards the KV cache." +msgstr "操作:卸载模型权重并清除KV缓存。" + +#: ../../user_guide/feature_guide/sleep_mode.md:18 +msgid "Memory: Model weights are moved to CPU memory; KV cache is forgotten." +msgstr "内存:模型权重被移动到CPU内存;KV缓存被清除。" + +#: ../../user_guide/feature_guide/sleep_mode.md:19 +msgid "Use Case: Suitable when reusing the same model later." 
+msgstr "用例:适用于之后需要重复使用同一个模型的情况。" + +#: ../../user_guide/feature_guide/sleep_mode.md:20 +msgid "" +"Note: Ensure sufficient CPU memory is available to hold the model weights." +msgstr "注意:请确保有足够的CPU内存来存储模型权重。" + +#: ../../user_guide/feature_guide/sleep_mode.md:22 +msgid "Level 2 Sleep" +msgstr "二级睡眠" + +#: ../../user_guide/feature_guide/sleep_mode.md:23 +msgid "Action: Discards both model weights and KV cache." +msgstr "操作:同时丢弃模型权重和KV缓存。" + +#: ../../user_guide/feature_guide/sleep_mode.md:24 +msgid "" +"Memory: The content of both the model weights and kv cache is forgotten." +msgstr "内存:模型权重和kv缓存的内容都会被遗忘。" + +#: ../../user_guide/feature_guide/sleep_mode.md:25 +msgid "" +"Use Case: Ideal when switching to a different model or updating the current " +"one." +msgstr "用例:当切换到不同的模型或更新当前模型时非常理想。" + +#: ../../user_guide/feature_guide/sleep_mode.md:27 +msgid "" +"Since this feature uses the low-level API " +"[AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html)," +" in order to use sleep mode, you should follow the [installation " +"guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) and " +"building from source, if you are using v0.7.3, remember to set `export " +"COMPILE_CUSTOM_KERNELS=1`, for the latest version(v0.9.x+), the environment " +"variable `COMPILE_CUSTOM_KERNELS` will be set 1 by default while building " +"from source." 
+msgstr "" +"由于此功能使用了底层 API " +"[AscendCL](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/API/appdevgapi/appdevgapi_07_0000.html),为了使用休眠模式,你应按照[安装指南](https://vllm-" +"ascend.readthedocs.io/en/latest/installation.html)进行操作,并从源码编译。如果你使用的是 " +"v0.7.3,请记得设置 `export COMPILE_CUSTOM_KERNELS=1` ;对于最新版本(v0.9.x+),在从源码编译时环境变量 " +"`COMPILE_CUSTOM_KERNELS` 默认会被设置为 1。" + +#: ../../user_guide/feature_guide/sleep_mode.md:29 +msgid "Usage" +msgstr "用法" + +#: ../../user_guide/feature_guide/sleep_mode.md:31 +msgid "The following is a simple example of how to use sleep mode." +msgstr "以下是如何使用睡眠模式的一个简单示例。" + +#: ../../user_guide/feature_guide/sleep_mode.md:33 +msgid "offline inference:" +msgstr "离线推理:" + +#: ../../user_guide/feature_guide/sleep_mode.md:72 +msgid "online serving:" +msgstr "在线服务:" + +#: ../../user_guide/feature_guide/sleep_mode.md:74 +msgid "" +"Considering there may be a risk of malicious access, please make sure you " +"are under a dev-mode, and explicit specify the develop env: " +"`VLLM_SERVER_DEV_MODE` to expose these endpoints(sleep/wake up)." +msgstr "" +"鉴于可能存在恶意访问的风险,请确保您处于开发模式,并明确指定开发环境:`VLLM_SERVER_DEV_MODE`,以便开放这些端点(sleep/wake" +" up)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po new file mode 100644 index 0000000..636e59d --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/feature_guide/structured_output.po @@ -0,0 +1,220 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/feature_guide/structured_output.md:1 +msgid "Structured Output Guide" +msgstr "结构化输出指南" + +#: ../../user_guide/feature_guide/structured_output.md:3 +msgid "Overview" +msgstr "概述" + +#: ../../user_guide/feature_guide/structured_output.md:5 +msgid "What is Structured Output?" +msgstr "什么是结构化输出?" + +#: ../../user_guide/feature_guide/structured_output.md:7 +msgid "" +"LLMs can be unpredictable when you need output in specific formats. Think of" +" asking a model to generate JSON - without guidance, it might produce valid " +"text that breaks JSON specification. **Structured Output (also called Guided" +" Decoding)** enables LLMs to generate outputs that follow a desired " +"structure while preserving the non-deterministic nature of the system." +msgstr "" +"当你需要特定格式输出时,大型语言模型(LLMs)可能表现出不可预测性。比如让模型生成 " +"JSON,如果没有指导,模型可能会生成有效的文本,但这些文本却不符合 JSON 规范。**结构化输出(也称为引导解码)** " +"能让大型语言模型生成符合预期结构的输出,同时保留系统的非确定性特性。" + +#: ../../user_guide/feature_guide/structured_output.md:9 +msgid "" +"In simple terms, structured decoding gives LLMs a “template” to follow. " +"Users provide a schema that “influences” the model’s output, ensuring " +"compliance with the desired structure." 
+msgstr "简单来说,结构化解码为LLM提供了一个“模板”来遵循。用户提供一个模式来“影响”模型的输出,从而确保输出符合期望的结构。"
+
+#: ../../user_guide/feature_guide/structured_output.md:11
+msgid "![structured decoding](./images/structured_output_1.png)"
+msgstr "![结构化解码](./images/structured_output_1.png)"
+
+#: ../../user_guide/feature_guide/structured_output.md:11
+msgid "structured decoding"
+msgstr "结构化解码"
+
+#: ../../user_guide/feature_guide/structured_output.md:13
+msgid "Structured Output in vllm-ascend"
+msgstr "vllm-ascend 中的结构化输出"
+
+#: ../../user_guide/feature_guide/structured_output.md:15
+msgid ""
+"Currently, vllm-ascend supports **xgrammar** and **guidance** backend for "
+"structured output with vllm v1 engine."
+msgstr "目前,vllm-ascend 支持 vllm v1 引擎的结构化输出,后端包括 **xgrammar** 和 **guidance**。"
+
+#: ../../user_guide/feature_guide/structured_output.md:17
+msgid ""
+"XGrammar introduces a new technique that batch constrained decoding via "
+"pushdown automaton (PDA). You can think of a PDA as a “collection of FSMs, "
+"and each FSM represents a context-free grammar (CFG).” One significant "
+"advantage of PDA is its recursive nature, allowing us to execute multiple "
+"state transitions. They also include additional optimisation (for those who "
+"are interested) to reduce grammar compilation overhead. Besides, you can "
+"also find more details about guidance by yourself."
+msgstr ""
+"XGrammar 引入了一种通过下推自动机(PDA)进行批量约束解码的新技术。你可以把 PDA 理解为“有限状态机(FSM)的集合,每个 FSM "
+"代表一个上下文无关文法(CFG)。” PDA 的一个重要优点是其递归特性,使我们能够执行多次状态转移。此外,PDA "
+"还包含了额外的优化(供感兴趣的用户参考),以减少语法编译的开销。除此之外,你还可以自己找到更多关于 guidance 后端的信息。"
+
+#: ../../user_guide/feature_guide/structured_output.md:19
+msgid "How to Use Structured Output?"
+msgstr "如何使用结构化输出?"
+
+#: ../../user_guide/feature_guide/structured_output.md:21
+msgid "Online Inference"
+msgstr "在线推理"
+
+#: ../../user_guide/feature_guide/structured_output.md:23
+msgid ""
+"You can also generate structured outputs using the OpenAI's Completions and "
+"Chat API. 
The following parameters are supported, which must be added as " +"extra parameters:" +msgstr "你也可以使用 OpenAI 的 Completions 和 Chat API 生成结构化输出。支持以下参数,这些参数必须作为额外参数添加:" + +#: ../../user_guide/feature_guide/structured_output.md:25 +msgid "`guided_choice`: the output will be exactly one of the choices." +msgstr "`guided_choice`:输出将会是其中一个选项。" + +#: ../../user_guide/feature_guide/structured_output.md:26 +msgid "`guided_regex`: the output will follow the regex pattern." +msgstr "`guided_regex`:输出将遵循正则表达式模式。" + +#: ../../user_guide/feature_guide/structured_output.md:27 +msgid "`guided_json`: the output will follow the JSON schema." +msgstr "`guided_json`:输出将遵循 JSON 架构。" + +#: ../../user_guide/feature_guide/structured_output.md:28 +msgid "`guided_grammar`: the output will follow the context free grammar." +msgstr "`guided_grammar`:输出将遵循上下文无关文法。" + +#: ../../user_guide/feature_guide/structured_output.md:30 +msgid "" +"Structured outputs are supported by default in the OpenAI-Compatible Server." +" You can choose to specify the backend to use by setting the `--guided-" +"decoding-backend` flag to vllm serve. The default backend is `auto`, which " +"will try to choose an appropriate backend based on the details of the " +"request. You may also choose a specific backend, along with some options." +msgstr "" +"OpenAI 兼容服务器默认支持结构化输出。你可以通过设置 `--guided-decoding-backend` 标志为 vllm serve " +"来指定要使用的后端。默认后端为 `auto`,它会根据请求的详细信息尝试选择合适的后端。你也可以选择特定的后端,并设置一些选项。" + +#: ../../user_guide/feature_guide/structured_output.md:32 +msgid "" +"Now let´s see an example for each of the cases, starting with the " +"guided_choice, as it´s the easiest one:" +msgstr "现在让我们来看每种情况的示例,首先是 guided_choice,因为它是最简单的:" + +#: ../../user_guide/feature_guide/structured_output.md:51 +msgid "" +"The next example shows how to use the guided_regex. 
The idea is to generate " +"an email address, given a simple regex template:" +msgstr "下一个例子展示了如何使用 guided_regex。其思路是基于一个简单的正则表达式模板生成一个电子邮件地址:" + +#: ../../user_guide/feature_guide/structured_output.md:67 +msgid "" +"One of the most relevant features in structured text generation is the " +"option to generate a valid JSON with pre-defined fields and formats. For " +"this we can use the guided_json parameter in two different ways:" +msgstr "" +"在结构化文本生成中,最相关的特性之一是能够生成具有预定义字段和格式的有效 JSON。为此,我们可以通过两种不同的方式使用 guided_json 参数:" + +#: ../../user_guide/feature_guide/structured_output.md:69 +msgid "Using a JSON Schema." +msgstr "使用 JSON 架构。" + +#: ../../user_guide/feature_guide/structured_output.md:70 +msgid "Defining a Pydantic model and then extracting the JSON Schema from it." +msgstr "定义一个 Pydantic 模型,然后从中提取 JSON Schema。" + +#: ../../user_guide/feature_guide/structured_output.md:72 +msgid "" +"The next example shows how to use the guided_json parameter with a Pydantic " +"model:" +msgstr "下一个示例展示了如何将 guided_json 参数与 Pydantic 模型一起使用:" + +#: ../../user_guide/feature_guide/structured_output.md:104 +msgid "" +"Finally we have the guided_grammar option, which is probably the most " +"difficult to use, but it´s really powerful. It allows us to define complete " +"languages like SQL queries. It works by using a context free EBNF grammar. " +"As an example, we can use to define a specific format of simplified SQL " +"queries:" +msgstr "" +"最后,我们有 guided_grammar 选项,这可能是最难使用的,但它非常强大。它允许我们定义完整的语言,比如 SQL 查询。它通过使用上下文无关的" +" EBNF 语法来实现。例如,我们可以用它来定义一种简化 SQL 查询的特定格式:" + +#: ../../user_guide/feature_guide/structured_output.md:134 +msgid "" +"Find more examples [here](https://github.com/vllm-" +"project/vllm/blob/main/examples/offline_inference/structured_outputs.py)." 
+msgstr "" +"在[这里](https://github.com/vllm-" +"project/vllm/blob/main/examples/offline_inference/structured_outputs.py)可以找到更多示例。" + +#: ../../user_guide/feature_guide/structured_output.md:136 +msgid "Offline Inference" +msgstr "离线推理" + +#: ../../user_guide/feature_guide/structured_output.md:138 +msgid "" +"To use Structured Output, we'll need to configure the guided decoding using " +"the class `GuidedDecodingParams` inside `SamplingParams`. The main available" +" options inside `GuidedDecodingParams` are:" +msgstr "" +"要使用结构化输出,我们需要在 `SamplingParams` 内通过 `GuidedDecodingParams` " +"类配置引导解码。`GuidedDecodingParams` 中主要可用的选项有:" + +#: ../../user_guide/feature_guide/structured_output.md:140 +msgid "json" +msgstr "json" + +#: ../../user_guide/feature_guide/structured_output.md:141 +msgid "regex" +msgstr "正则表达式" + +#: ../../user_guide/feature_guide/structured_output.md:142 +msgid "choice" +msgstr "选择" + +#: ../../user_guide/feature_guide/structured_output.md:143 +msgid "grammar" +msgstr "语法" + +#: ../../user_guide/feature_guide/structured_output.md:145 +msgid "One example for the usage of the choice parameter is shown below:" +msgstr "choice 参数用法的一个示例如下:" + +#: ../../user_guide/feature_guide/structured_output.md:163 +msgid "" +"Find more examples of other usages [here](https://github.com/vllm-" +"project/vllm/blob/main/examples/offline_inference/structured_outputs.py)." +msgstr "" +"查看更多其他用法的示例 [在这里](https://github.com/vllm-" +"project/vllm/blob/main/examples/offline_inference/structured_outputs.py)。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po new file mode 100644 index 0000000..ddd1eb5 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/release_notes.po @@ -0,0 +1,1660 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. 
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PROJECT VERSION\n"
+"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n"
+"POT-Creation-Date: 2025-07-18 09:01+0800\n"
+"PO-Revision-Date: 2025-07-18 10:11+0800\n"
+"Last-Translator: \n"
+"Language-Team: \n"
+"Language: zh\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.17.0\n"
+"X-Generator: Poedit 3.5\n"
+
+#: ../../user_guide/release_notes.md:1
+msgid "Release note"
+msgstr "版本说明"
+
+#: ../../user_guide/release_notes.md:3
+msgid "v0.9.2rc1 - 2025.07.11"
+msgstr "v0.9.2rc1 - 2025.07.11"
+
+#: ../../user_guide/release_notes.md:5
+msgid ""
+"This is the 1st release candidate of v0.9.2 for vLLM Ascend. Please follow "
+"the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started. "
+"From this release, V1 engine will be enabled by default, there is no need "
+"to set `VLLM_USE_V1=1` any more. And this release is the last version to "
+"support V0 engine, V0 code will be clean up in the future."
+msgstr ""
+"这是 vLLM Ascend v0.9.2 的第一个候选发布版本。请参阅[官方文档](https://vllm-"
+"ascend.readthedocs.io/en/)开始使用。从本次发布起,V1 引擎将默认启用,不再需"
+"要设置 `VLLM_USE_V1=1`。此外,该版本也是最后一个支持 V0 引擎的版本,V0 相关"
+"代码将在未来被清理。"
+
+#: ../../user_guide/release_notes.md:7 ../../user_guide/release_notes.md:34
+#: ../../user_guide/release_notes.md:70 ../../user_guide/release_notes.md:78
+#: ../../user_guide/release_notes.md:116 ../../user_guide/release_notes.md:140
+#: ../../user_guide/release_notes.md:163 ../../user_guide/release_notes.md:186
+#: ../../user_guide/release_notes.md:206 ../../user_guide/release_notes.md:231
+#: ../../user_guide/release_notes.md:253 ../../user_guide/release_notes.md:285
+msgid "Highlights"
+msgstr "亮点"
+
+#: ../../user_guide/release_notes.md:8
+msgid ""
+"Pooling model works with V1 engine now. You can take a try with Qwen3 "
+"embedding model [#1359](https://github.com/vllm-project/vllm-ascend/"
+"pull/1359)."
+msgstr "" +"Pooling 模型现在可以与 V1 引擎一起使用。你可以尝试使用 Qwen3 embedding 模型 " +"[#1359](https://github.com/vllm-project/vllm-ascend/pull/1359)。" + +#: ../../user_guide/release_notes.md:9 +msgid "" +"The performance on Atlas 300I series has been improved. [#1591](https://" +"github.com/vllm-project/vllm-ascend/pull/1591)" +msgstr "" +"Atlas 300I 系列的性能已经提升。 [#1591](https://github.com/vllm-project/" +"vllm-ascend/pull/1591)" + +#: ../../user_guide/release_notes.md:10 +msgid "" +"aclgraph mode works with Moe models now. Currently, only Qwen3 Moe is well " +"tested. [#1381](https://github.com/vllm-project/vllm-ascend/pull/1381)" +msgstr "" +"aclgraph 模式现在可以与 Moe 模型一起使用。目前,仅对 Qwen3 Moe 进行了充分测" +"试。[#1381](https://github.com/vllm-project/vllm-ascend/pull/1381)" + +#: ../../user_guide/release_notes.md:12 ../../user_guide/release_notes.md:39 +#: ../../user_guide/release_notes.md:83 ../../user_guide/release_notes.md:146 +#: ../../user_guide/release_notes.md:168 ../../user_guide/release_notes.md:191 +#: ../../user_guide/release_notes.md:212 ../../user_guide/release_notes.md:236 +#: ../../user_guide/release_notes.md:258 ../../user_guide/release_notes.md:291 +msgid "Core" +msgstr "核心" + +#: ../../user_guide/release_notes.md:13 +msgid "" +"Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1." +"dev20250619`. Don’t forget to update it in your environment. [#1347]" +"(https://github.com/vllm-project/vllm-ascend/pull/1347)" +msgstr "" +"Ascend PyTorch 适配器(torch_npu)已升级到 `2.5.1.post1.dev20250619`。请不要" +"忘记在您的环境中进行更新。 [#1347](https://github.com/vllm-project/vllm-" +"ascend/pull/1347)" + +#: ../../user_guide/release_notes.md:14 +msgid "" +"The **GatherV3** error has been fixed with **aclgraph** mode. 
[#1416]" +"(https://github.com/vllm-project/vllm-ascend/pull/1416)" +msgstr "" +"**GatherV3** 错误已通过 **aclgraph** 模式修复。[#1416](https://github.com/" +"vllm-project/vllm-ascend/pull/1416)" + +#: ../../user_guide/release_notes.md:15 +msgid "" +"W8A8 quantization works on Atlas 300I series now. [#1560](https://github." +"com/vllm-project/vllm-ascend/pull/1560)" +msgstr "" +"W8A8 量化现在可以在 Atlas 300I 系列上运行了。[#1560](https://github.com/" +"vllm-project/vllm-ascend/pull/1560)" + +#: ../../user_guide/release_notes.md:16 +msgid "" +"Fix the accuracy problem with deploy models with parallel parameters. " +"[#1678](https://github.com/vllm-project/vllm-ascend/pull/1678)" +msgstr "" +"修复了使用并行参数部署模型时的准确性问题。[#1678](https://github.com/vllm-" +"project/vllm-ascend/pull/1678)" + +#: ../../user_guide/release_notes.md:17 +msgid "" +"The pre-built wheel package now requires lower version of glibc. Users can " +"use it by `pip install vllm-ascend` directly. [#1582](https://github.com/" +"vllm-project/vllm-ascend/pull/1582)" +msgstr "" +"预编译的 wheel 包现在要求更低版本的 glibc。用户可以直接通过 `pip install " +"vllm-ascend` 使用它。[#1582](https://github.com/vllm-project/vllm-ascend/" +"pull/1582)" + +#: ../../user_guide/release_notes.md:19 ../../user_guide/release_notes.md:99 +#: ../../user_guide/release_notes.md:153 ../../user_guide/release_notes.md:177 +#: ../../user_guide/release_notes.md:195 ../../user_guide/release_notes.md:219 +#: ../../user_guide/release_notes.md:242 ../../user_guide/release_notes.md:266 +#: ../../user_guide/release_notes.md:296 +msgid "Other" +msgstr "其它" + +#: ../../user_guide/release_notes.md:20 +msgid "" +"Official doc has been updated for better read experience. For example, more " +"deployment tutorials are added, user/developer docs are updated. More guide " +"will coming soon." 
+msgstr "" +"官方文档已更新,以提升阅读体验。例如,增加了更多部署教程,用户/开发者文档已" +"更新。更多指南即将推出。" + +#: ../../user_guide/release_notes.md:21 +msgid "" +"Fix accuracy problem for deepseek V3/R1 models with torchair graph in long " +"sequence predictions. [#1331](https://github.com/vllm-project/vllm-ascend/" +"pull/1331)" +msgstr "" +"修复 deepseek V3/R1 模型在使用 torchair 图进行长序列预测时的精度问题。" +"[#1331](https://github.com/vllm-project/vllm-ascend/pull/1331)" + +#: ../../user_guide/release_notes.md:22 +msgid "" +"A new env variable `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` has been added. " +"It enables the fused allgather-experts kernel for Deepseek V3/R1 models. " +"The default value is `0`. [#1335](https://github.com/vllm-project/vllm-" +"ascend/pull/1335)" +msgstr "" +"新增了一个环境变量 `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP`。它用于启用 " +"Deepseek V3/R1 模型的 fused allgather-experts 内核。默认值为 `0`。[#1335]" +"(https://github.com/vllm-project/vllm-ascend/pull/1335)" + +#: ../../user_guide/release_notes.md:23 +msgid "" +"A new env variable `VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION` has been " +"added to improve the performance of topk-topp sampling. 
The default value " +"is 0, we'll consider to enable it by default in the future[#1732](https://" +"github.com/vllm-project/vllm-ascend/pull/1732)" +msgstr "" +"新增了一个环境变量 `VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION`,用于提升 " +"topk-topp 采样的性能。该变量默认值为 0,未来我们会考虑默认启用此选项[#1732]" +"(https://github.com/vllm-project/vllm-ascend/pull/1732)。" + +#: ../../user_guide/release_notes.md:24 +msgid "" +"A batch of bugs have been fixed for Data Parallelism case [#1273](https://" +"github.com/vllm-project/vllm-ascend/pull/1273) [#1322](https://github.com/" +"vllm-project/vllm-ascend/pull/1322) [#1275](https://github.com/vllm-project/" +"vllm-ascend/pull/1275) [#1478](https://github.com/vllm-project/vllm-ascend/" +"pull/1478)" +msgstr "" +"已修复了一批与数据并行相关的 bug [#1273](https://github.com/vllm-project/" +"vllm-ascend/pull/1273) [#1322](https://github.com/vllm-project/vllm-ascend/" +"pull/1322) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) " +"[#1478](https://github.com/vllm-project/vllm-ascend/pull/1478)" + +#: ../../user_guide/release_notes.md:25 +msgid "" +"The DeepSeek performance has been improved. [#1194](https://github.com/vllm-" +"project/vllm-ascend/pull/1194) [#1395](https://github.com/vllm-project/vllm-" +"ascend/pull/1395) [#1380](https://github.com/vllm-project/vllm-ascend/" +"pull/1380)" +msgstr "" +"DeepSeek 的性能已得到提升。[#1194](https://github.com/vllm-project/vllm-" +"ascend/pull/1194) [#1395](https://github.com/vllm-project/vllm-ascend/" +"pull/1395) [#1380](https://github.com/vllm-project/vllm-ascend/pull/1380)" + +#: ../../user_guide/release_notes.md:26 +msgid "" +"Ascend scheduler works with prefix cache now. [#1446](https://github.com/" +"vllm-project/vllm-ascend/pull/1446)" +msgstr "" +"Ascend 调度器现在支持前缀缓存。[#1446](https://github.com/vllm-project/vllm-" +"ascend/pull/1446)" + +#: ../../user_guide/release_notes.md:27 +msgid "" +"DeepSeek now works with prefix cache now. 
[#1498](https://github.com/vllm-" +"project/vllm-ascend/pull/1498)" +msgstr "" +"DeepSeek 现在支持前缀缓存了。[#1498](https://github.com/vllm-project/vllm-" +"ascend/pull/1498)" + +#: ../../user_guide/release_notes.md:28 +msgid "" +"Support prompt logprobs to recover ceval accuracy in V1 [#1483](https://" +"github.com/vllm-project/vllm-ascend/pull/1483)" +msgstr "" +"支持使用 prompt logprobs 恢复 V1 的 ceval 准确率 [#1483](https://github.com/" +"vllm-project/vllm-ascend/pull/1483)" + +#: ../../user_guide/release_notes.md:30 +msgid "v0.9.1rc1 - 2025.06.22" +msgstr "v0.9.1rc1 - 2025.06.22" + +#: ../../user_guide/release_notes.md:32 +msgid "" +"This is the 1st release candidate of v0.9.1 for vLLM Ascend. Please follow " +"the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started." +msgstr "" +"这是 vLLM Ascend v0.9.1 的第一个候选发布版本。请按照[官方文档](https://vllm-" +"ascend.readthedocs.io/en/)开始使用。" + +#: ../../user_guide/release_notes.md:36 +msgid "" +"Atlas 300I series is experimental supported in this release. [#1333]" +"(https://github.com/vllm-project/vllm-ascend/pull/1333) After careful " +"consideration, this feature **will NOT be included in v0.9.1-dev branch** " +"taking into account the v0.9.1 release quality and the feature rapid " +"iteration to improve performance on Atlas 300I series. We will improve this " +"from 0.9.2rc1 and later." +msgstr "" +"本版本对 Atlas 300I 系列提供了实验性支持。[#1333](https://github.com/vllm-" +"project/vllm-ascend/pull/1333) 经过慎重考虑,鉴于 v0.9.1 版本发布的质量要求" +"以及 Atlas 300I 系列性能优化的快速迭代,该功能**不会被包含在 v0.9.1-dev 分支" +"中**。我们将在 0.9.2rc1 及之后的版本中进一步完善该功能。" + +#: ../../user_guide/release_notes.md:37 +msgid "" +"Support EAGLE-3 for speculative decoding. [#1032](https://github.com/vllm-" +"project/vllm-ascend/pull/1032)" +msgstr "" +"支持 EAGLE-3 进行推测式解码。[#1032](https://github.com/vllm-project/vllm-" +"ascend/pull/1032)" + +#: ../../user_guide/release_notes.md:40 +msgid "" +"Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1." +"dev20250528`. 
Don’t forget to update it in your environment. [#1235]" +"(https://github.com/vllm-project/vllm-ascend/pull/1235)" +msgstr "" +"Ascend PyTorch 适配器(torch_npu)已升级到 `2.5.1.post1.dev20250528`。请不要" +"忘记在您的环境中进行更新。[#1235](https://github.com/vllm-project/vllm-" +"ascend/pull/1235)" + +#: ../../user_guide/release_notes.md:41 +msgid "" +"Support Atlas 300I series container image. You can get it from [quay.io]" +"(https://quay.io/repository/vllm/vllm-ascend)" +msgstr "" +"支持Atlas 300I系列的容器镜像。你可以从[quay.io](https://quay.io/repository/" +"vllm/vllm-ascend)获取。" + +#: ../../user_guide/release_notes.md:42 +msgid "" +"Fix token-wise padding mechanism to make multi-card graph mode work. [#1300]" +"(https://github.com/vllm-project/vllm-ascend/pull/1300)" +msgstr "" +"修复按 token 填充机制以支持多卡图模式。 [#1300](https://github.com/vllm-" +"project/vllm-ascend/pull/1300)" + +#: ../../user_guide/release_notes.md:43 +msgid "" +"Upgrade vllm to 0.9.1 [#1165]https://github.com/vllm-project/vllm-ascend/" +"pull/1165" +msgstr "" +"将 vllm 升级到 0.9.1 [#1165]https://github.com/vllm-project/vllm-ascend/" +"pull/1165" + +#: ../../user_guide/release_notes.md:45 +msgid "Other Improvements" +msgstr "其他改进" + +#: ../../user_guide/release_notes.md:46 +msgid "" +"Initial support Chunked Prefill for MLA. [#1172](https://github.com/vllm-" +"project/vllm-ascend/pull/1172)" +msgstr "" +"为MLA初步支持分块预填充。 [#1172](https://github.com/vllm-project/vllm-" +"ascend/pull/1172)" + +#: ../../user_guide/release_notes.md:47 +msgid "" +"An example of best practices to run DeepSeek with ETP has been added. " +"[#1101](https://github.com/vllm-project/vllm-ascend/pull/1101)" +msgstr "" +"已新增一个使用 ETP 运行 DeepSeek 的最佳实践示例。[#1101](https://github.com/" +"vllm-project/vllm-ascend/pull/1101)" + +#: ../../user_guide/release_notes.md:48 +msgid "" +"Performance improvements for DeepSeek using the TorchAir graph. 
[#1098]"
+"(https://github.com/vllm-project/vllm-ascend/pull/1098), [#1131](https://"
+"github.com/vllm-project/vllm-ascend/pull/1131)"
+msgstr ""
+"通过使用 TorchAir 图对 DeepSeek 进行了性能提升。[#1098](https://github.com/"
+"vllm-project/vllm-ascend/pull/1098), [#1131](https://github.com/vllm-"
+"project/vllm-ascend/pull/1131)"
+
+#: ../../user_guide/release_notes.md:49
+msgid ""
+"Supports the speculative decoding feature with AscendScheduler. [#943]"
+"(https://github.com/vllm-project/vllm-ascend/pull/943)"
+msgstr ""
+"支持 AscendScheduler 的推测式解码功能。[#943](https://github.com/vllm-"
+"project/vllm-ascend/pull/943)"
+
+#: ../../user_guide/release_notes.md:50
+msgid ""
+"Improve `VocabParallelEmbedding` custom op performance. It will be enabled "
+"in the next release. [#796](https://github.com/vllm-project/vllm-ascend/"
+"pull/796)"
+msgstr ""
+"提升 `VocabParallelEmbedding` 自定义算子的性能。该优化将在下一个版本中启用。"
+"[#796](https://github.com/vllm-project/vllm-ascend/pull/796)"
+
+#: ../../user_guide/release_notes.md:51
+msgid ""
+"Fixed a device discovery and setup bug when running vLLM Ascend on Ray "
+"[#884](https://github.com/vllm-project/vllm-ascend/pull/884)"
+msgstr ""
+"修复了在 Ray 上运行 vLLM Ascend 时的设备发现和设置错误 [#884](https://"
+"github.com/vllm-project/vllm-ascend/pull/884)"
+
+#: ../../user_guide/release_notes.md:52
+msgid ""
+"DeepSeek with [MC2](https://www.hiascend.com/document/detail/zh/"
+"canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/"
+"atlas_ascendc_best_practices_10_0043.html) (Merged Compute and "
+"Communication) now works properly. 
[#1268](https://github.com/vllm-project/" +"vllm-ascend/pull/1268)" +msgstr "" +"DeepSeek 现已可以与 [MC2](https://www.hiascend.com/document/detail/zh/" +"canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/" +"atlas_ascendc_best_practices_10_0043.html)(计算与通信融合)正常工作。" +"[#1268](https://github.com/vllm-project/vllm-ascend/pull/1268)" + +#: ../../user_guide/release_notes.md:53 +msgid "" +"Fixed log2phy NoneType bug with static EPLB feature. [#1186](https://github." +"com/vllm-project/vllm-ascend/pull/1186)" +msgstr "" +"修复了带有静态 EPLB 特性时 log2phy 为 NoneType 的 bug。[#1186](https://" +"github.com/vllm-project/vllm-ascend/pull/1186)" + +#: ../../user_guide/release_notes.md:54 +msgid "" +"Improved performance for DeepSeek with DBO enabled. [#997](https://github." +"com/vllm-project/vllm-ascend/pull/997), [#1135](https://github.com/vllm-" +"project/vllm-ascend/pull/1135)" +msgstr "" +"启用 DBO 后,DeepSeek 的性能得到提升。[#997](https://github.com/vllm-" +"project/vllm-ascend/pull/997),[#1135](https://github.com/vllm-project/vllm-" +"ascend/pull/1135)" + +#: ../../user_guide/release_notes.md:55 +msgid "" +"Refactoring AscendFusedMoE [#1229](https://github.com/vllm-project/vllm-" +"ascend/pull/1229)" +msgstr "" +"重构 AscendFusedMoE [#1229](https://github.com/vllm-project/vllm-ascend/" +"pull/1229)" + +#: ../../user_guide/release_notes.md:56 +msgid "" +"Add initial user stories page (include LLaMA-Factory/TRL/verl/MindIE Turbo/" +"GPUStack) [#1224](https://github.com/vllm-project/vllm-ascend/pull/1224)" +msgstr "" +"新增初始用户故事页面(包括 LLaMA-Factory/TRL/verl/MindIE Turbo/GPUStack)" +"[#1224](https://github.com/vllm-project/vllm-ascend/pull/1224)" + +#: ../../user_guide/release_notes.md:57 +msgid "" +"Add unit test framework [#1201](https://github.com/vllm-project/vllm-ascend/" +"pull/1201)" +msgstr "" +"添加单元测试框架 [#1201](https://github.com/vllm-project/vllm-ascend/" +"pull/1201)" + +#: ../../user_guide/release_notes.md:59 +msgid "Known Issues" +msgstr "已知问题" + +#: 
../../user_guide/release_notes.md:60 +msgid "" +"In some cases, the vLLM process may crash with a **GatherV3** error when " +"**aclgraph** is enabled. We are working on this issue and will fix it in " +"the next release. [#1038](https://github.com/vllm-project/vllm-ascend/" +"issues/1038)" +msgstr "" +"在某些情况下,当启用 **aclgraph** 时,vLLM 进程可能会因 **GatherV3** 错误而" +"崩溃。我们正在解决此问题,并将在下一个版本中修复。[#1038](https://github." +"com/vllm-project/vllm-ascend/issues/1038)" + +#: ../../user_guide/release_notes.md:61 +msgid "" +"Prefix cache feature does not work with the Ascend Scheduler but without " +"chunked prefill enabled. This will be fixed in the next release. [#1350]" +"(https://github.com/vllm-project/vllm-ascend/issues/1350)" +msgstr "" +"前缀缓存功能在未启用分块预填充的情况下无法与 Ascend 调度器一同工作。此问题将" +"在下一个版本中修复。[#1350](https://github.com/vllm-project/vllm-ascend/" +"issues/1350)" + +#: ../../user_guide/release_notes.md:63 +msgid "Full Changelog" +msgstr "完整更新日志" + +#: ../../user_guide/release_notes.md:64 +msgid "" +"https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1" +msgstr "" +"https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1" + +#: ../../user_guide/release_notes.md:66 +msgid "v0.9.0rc2 - 2025.06.10" +msgstr "v0.9.0rc2 - 2025.06.10" + +#: ../../user_guide/release_notes.md:68 +msgid "" +"This release contains some quick fixes for v0.9.0rc1. Please use this " +"release instead of v0.9.0rc1." +msgstr "" +"本次发布包含了一些针对 v0.9.0rc1 的快速修复。请使用本次发布版本,而不是 " +"v0.9.0rc1。" + +#: ../../user_guide/release_notes.md:72 +msgid "" +"Fix the import error when vllm-ascend is installed without editable way. " +"[#1152](https://github.com/vllm-project/vllm-ascend/pull/1152)" +msgstr "" +"修复当以非可编辑方式安装 vllm-ascend 时的导入错误。[#1152](https://github." 
+"com/vllm-project/vllm-ascend/pull/1152)" + +#: ../../user_guide/release_notes.md:74 +msgid "v0.9.0rc1 - 2025.06.09" +msgstr "v0.9.0rc1 - 2025.06.09" + +#: ../../user_guide/release_notes.md:76 +msgid "" +"This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow " +"the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the " +"journey. From this release, V1 Engine is recommended to use. The code of V0 " +"Engine is frozen and will not be maintained any more. Please set " +"environment `VLLM_USE_V1=1` to enable V1 Engine." +msgstr "" +"这是 vllm-ascend v0.9.0 的第一个候选发布版本。请按照[官方文档](https://vllm-" +"ascend.readthedocs.io/en/)开始使用。从此版本起,推荐使用 V1 引擎。V0 引擎的" +"代码已被冻结,不再维护。如需启用 V1 引擎,请设置环境变量 `VLLM_USE_V1=1`。" + +#: ../../user_guide/release_notes.md:80 +msgid "" +"DeepSeek works with graph mode now. Follow the [official doc](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html) " +"to take a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789)" +msgstr "" +"DeepSeek 现在已支持图模式。请按照[官方文档](https://vllm-ascend.readthedocs." +"io/en/latest/user_guide/feature_guide/graph_mode.html)进行尝试。[#789]" +"(https://github.com/vllm-project/vllm-ascend/pull/789)" + +#: ../../user_guide/release_notes.md:81 +msgid "" +"Qwen series models works with graph mode now. It works by default with V1 " +"Engine. Please note that in this release, only Qwen series models are well " +"tested with graph mode. We'll make it stable and generalize in the next " +"release. If you hit any issues, please feel free to open an issue on GitHub " +"and fallback to eager mode temporarily by set `enforce_eager=True` when " +"initializing the model." 
+msgstr "" +"Qwen 系列模型现在支持图模式。默认情况下,它在 V1 引擎下运行。请注意,本次发" +"布中,仅 Qwen 系列模型经过了充分的图模式测试。我们将在下一个版本中进一步提升" +"其稳定性并推广至更广泛的场景。如果你遇到任何问题,请随时在 GitHub 上提交 " +"issue,并在初始化模型时通过设置 `enforce_eager=True` 临时切换回 eager 模式。" + +#: ../../user_guide/release_notes.md:85 +msgid "" +"The performance of multi-step scheduler has been improved. Thanks for the " +"contribution from China Merchants Bank. [#814](https://github.com/vllm-" +"project/vllm-ascend/pull/814)" +msgstr "" +"多步调度器的性能得到了提升。感谢招商银行的贡献。[#814](https://github.com/" +"vllm-project/vllm-ascend/pull/814)" + +#: ../../user_guide/release_notes.md:86 +msgid "" +"LoRA、Multi-LoRA And Dynamic Serving is supported for V1 Engine now. Thanks " +"for the contribution from China Merchants Bank. [#893](https://github.com/" +"vllm-project/vllm-ascend/pull/893)" +msgstr "" +"V1 引擎现在支持 LoRA、多 LoRA 以及动态服务。感谢招商银行的贡献。[#893]" +"(https://github.com/vllm-project/vllm-ascend/pull/893)" + +#: ../../user_guide/release_notes.md:87 +msgid "" +"Prefix cache and chunked prefill feature works now [#782](https://github." +"com/vllm-project/vllm-ascend/pull/782) [#844](https://github.com/vllm-" +"project/vllm-ascend/pull/844)" +msgstr "" +"前缀缓存和分块预填充功能现已可用 [#782](https://github.com/vllm-project/" +"vllm-ascend/pull/782) [#844](https://github.com/vllm-project/vllm-ascend/" +"pull/844)" + +#: ../../user_guide/release_notes.md:88 +msgid "" +"Spec decode and MTP features work with V1 Engine now. [#874](https://github." +"com/vllm-project/vllm-ascend/pull/874) [#890](https://github.com/vllm-" +"project/vllm-ascend/pull/890)" +msgstr "" +"Spec 解码和 MTP 功能现在已经支持 V1 引擎。[#874](https://github.com/vllm-" +"project/vllm-ascend/pull/874) [#890](https://github.com/vllm-project/vllm-" +"ascend/pull/890)" + +#: ../../user_guide/release_notes.md:89 +msgid "" +"DP feature works with DeepSeek now. 
[#1012](https://github.com/vllm-project/" +"vllm-ascend/pull/1012)" +msgstr "" +"DP 功能现在可以与 DeepSeek 一起使用。[#1012](https://github.com/vllm-" +"project/vllm-ascend/pull/1012)" + +#: ../../user_guide/release_notes.md:90 +msgid "" +"Input embedding feature works with V0 Engine now. [#916](https://github.com/" +"vllm-project/vllm-ascend/pull/916)" +msgstr "" +"输入嵌入特性现在已支持 V0 引擎。[#916](https://github.com/vllm-project/vllm-" +"ascend/pull/916)" + +#: ../../user_guide/release_notes.md:91 +msgid "" +"Sleep mode feature works with V1 Engine now. [#1084](https://github.com/" +"vllm-project/vllm-ascend/pull/1084)" +msgstr "" +"休眠模式功能现在已支持 V1 引擎。[#1084](https://github.com/vllm-project/" +"vllm-ascend/pull/1084)" + +#: ../../user_guide/release_notes.md:93 ../../user_guide/release_notes.md:149 +#: ../../user_guide/release_notes.md:239 ../../user_guide/release_notes.md:262 +msgid "Model" +msgstr "模型" + +#: ../../user_guide/release_notes.md:95 +msgid "" +"Qwen2.5 VL works with V1 Engine now. [#736](https://github.com/vllm-project/" +"vllm-ascend/pull/736)" +msgstr "" +"Qwen2.5 VL 现在可以与 V1 引擎协同工作。[#736](https://github.com/vllm-" +"project/vllm-ascend/pull/736)" + +#: ../../user_guide/release_notes.md:96 +msgid "" +"LLama4 works now. [#740](https://github.com/vllm-project/vllm-ascend/" +"pull/740)" +msgstr "" +"LLama4 现在可以使用了。[#740](https://github.com/vllm-project/vllm-ascend/" +"pull/740)" + +#: ../../user_guide/release_notes.md:97 +msgid "" +"A new kind of DeepSeek model called dual-batch overlap(DBO) is added. " +"Please set `VLLM_ASCEND_ENABLE_DBO=1` to use it. [#941](https://github.com/" +"vllm-project/vllm-ascend/pull/941)" +msgstr "" +"新增了一种名为双批次重叠(dual-batch overlap,DBO)的 DeepSeek 模型。请设置 " +"`VLLM_ASCEND_ENABLE_DBO=1` 以启用。 [#941](https://github.com/vllm-project/" +"vllm-ascend/pull/941)" + +#: ../../user_guide/release_notes.md:101 +msgid "" +"online serve with ascend quantization works now. 
[#877](https://github.com/" +"vllm-project/vllm-ascend/pull/877)" +msgstr "" +"在线服务现已支持Ascend量化。[#877](https://github.com/vllm-project/vllm-" +"ascend/pull/877)" + +#: ../../user_guide/release_notes.md:102 +msgid "" +"A batch of bugs for graph mode and moe model have been fixed. [#773]" +"(https://github.com/vllm-project/vllm-ascend/pull/773) [#771](https://" +"github.com/vllm-project/vllm-ascend/pull/771) [#774](https://github.com/" +"vllm-project/vllm-ascend/pull/774) [#816](https://github.com/vllm-project/" +"vllm-ascend/pull/816) [#817](https://github.com/vllm-project/vllm-ascend/" +"pull/817) [#819](https://github.com/vllm-project/vllm-ascend/pull/819) " +"[#912](https://github.com/vllm-project/vllm-ascend/pull/912) [#897](https://" +"github.com/vllm-project/vllm-ascend/pull/897) [#961](https://github.com/" +"vllm-project/vllm-ascend/pull/961) [#958](https://github.com/vllm-project/" +"vllm-ascend/pull/958) [#913](https://github.com/vllm-project/vllm-ascend/" +"pull/913) [#905](https://github.com/vllm-project/vllm-ascend/pull/905)" +msgstr "" +"已修复一批关于图模式和moe模型的bug。[#773](https://github.com/vllm-project/" +"vllm-ascend/pull/773) [#771](https://github.com/vllm-project/vllm-ascend/" +"pull/771) [#774](https://github.com/vllm-project/vllm-ascend/pull/774) " +"[#816](https://github.com/vllm-project/vllm-ascend/pull/816) [#817](https://" +"github.com/vllm-project/vllm-ascend/pull/817) [#819](https://github.com/" +"vllm-project/vllm-ascend/pull/819) [#912](https://github.com/vllm-project/" +"vllm-ascend/pull/912) [#897](https://github.com/vllm-project/vllm-ascend/" +"pull/897) [#961](https://github.com/vllm-project/vllm-ascend/pull/961) " +"[#958](https://github.com/vllm-project/vllm-ascend/pull/958) [#913](https://" +"github.com/vllm-project/vllm-ascend/pull/913) [#905](https://github.com/" +"vllm-project/vllm-ascend/pull/905)" + +#: ../../user_guide/release_notes.md:103 +msgid "" +"A batch of performance improvement PRs have been merged. 
[#784](https://" +"github.com/vllm-project/vllm-ascend/pull/784) [#803](https://github.com/" +"vllm-project/vllm-ascend/pull/803) [#966](https://github.com/vllm-project/" +"vllm-ascend/pull/966) [#839](https://github.com/vllm-project/vllm-ascend/" +"pull/839) [#970](https://github.com/vllm-project/vllm-ascend/pull/970) " +"[#947](https://github.com/vllm-project/vllm-ascend/pull/947) [#987](https://" +"github.com/vllm-project/vllm-ascend/pull/987) [#1085](https://github.com/" +"vllm-project/vllm-ascend/pull/1085)" +msgstr "" +"一批性能改进的 PR 已被合并。[#784](https://github.com/vllm-project/vllm-" +"ascend/pull/784) [#803](https://github.com/vllm-project/vllm-ascend/" +"pull/803) [#966](https://github.com/vllm-project/vllm-ascend/pull/966) " +"[#839](https://github.com/vllm-project/vllm-ascend/pull/839) [#970](https://" +"github.com/vllm-project/vllm-ascend/pull/970) [#947](https://github.com/" +"vllm-project/vllm-ascend/pull/947) [#987](https://github.com/vllm-project/" +"vllm-ascend/pull/987) [#1085](https://github.com/vllm-project/vllm-ascend/" +"pull/1085)" + +#: ../../user_guide/release_notes.md:104 +msgid "" +"From this release, binary wheel package will be released as well. [#775]" +"(https://github.com/vllm-project/vllm-ascend/pull/775)" +msgstr "" +"从本版本开始,将同时发布二进制 wheel 包。[#775](https://github.com/vllm-" +"project/vllm-ascend/pull/775)" + +#: ../../user_guide/release_notes.md:105 +msgid "" +"The contributor doc site is [added](https://vllm-ascend.readthedocs.io/en/" +"latest/community/contributors.html)" +msgstr "" +"贡献者文档站点已[添加](https://vllm-ascend.readthedocs.io/en/latest/" +"community/contributors.html)" + +#: ../../user_guide/release_notes.md:107 +msgid "Known Issue" +msgstr "已知问题" + +#: ../../user_guide/release_notes.md:109 +msgid "" +"In some case, vLLM process may be crashed with aclgraph enabled. We're " +"working this issue and it'll be fixed in the next release." 
+msgstr "" +"在某些情况下,启用 aclgraph 时 vLLM 进程可能会崩溃。我们正在处理这个问题,并" +"将在下一个版本中修复。" + +#: ../../user_guide/release_notes.md:110 +msgid "" +"Multi node data-parallel doesn't work with this release. This is a known " +"issue in vllm and has been fixed on main branch. [#18981](https://github." +"com/vllm-project/vllm/pull/18981)" +msgstr "" +"多节点数据并行在此版本中无法使用。这是 vllm 中已知的问题,并已在主分支中修" +"复。 [#18981](https://github.com/vllm-project/vllm/pull/18981)" + +#: ../../user_guide/release_notes.md:112 +msgid "v0.7.3.post1 - 2025.05.29" +msgstr "v0.7.3.post1 - 2025.05.29" + +#: ../../user_guide/release_notes.md:114 +msgid "" +"This is the first post release of 0.7.3. Please follow the [official doc]" +"(https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. It " +"includes the following changes:" +msgstr "" +"这是 0.7.3 的第一个补丁发布。请按照[官方文档](https://vllm-ascend." +"readthedocs.io/en/v0.7.3-dev)开始使用。本次更新包括以下更改:" + +#: ../../user_guide/release_notes.md:118 +msgid "" +"Qwen3 and Qwen3MOE is supported now. The performance and accuracy of Qwen3 " +"is well tested. You can try it now. Mindie Turbo is recomanded to improve " +"the performance of Qwen3. [#903](https://github.com/vllm-project/vllm-" +"ascend/pull/903) [#915](https://github.com/vllm-project/vllm-ascend/" +"pull/915)" +msgstr "" +"现在已支持 Qwen3 和 Qwen3MOE。Qwen3 的性能和精度已经过充分测试,你可以立即试" +"用。推荐使用 Mindie Turbo 以提升 Qwen3 的性能。[#903](https://github.com/" +"vllm-project/vllm-ascend/pull/903) [#915](https://github.com/vllm-project/" +"vllm-ascend/pull/915)" + +#: ../../user_guide/release_notes.md:119 +msgid "" +"Added a new performance guide. The guide aims to help users to improve vllm-" +"ascend performance on system level. It includes OS configuration, library " +"optimization, deploy guide and so on. 
[#878](https://github.com/vllm-" +"project/vllm-ascend/pull/878) [Doc Link](https://vllm-ascend.readthedocs.io/" +"en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html)" +msgstr "" +"新增了一个性能指南。该指南旨在帮助用户在系统层面提升 vllm-ascend 的性能。内" +"容包括操作系统配置、库优化、部署指南等。 [#878](https://github.com/vllm-" +"project/vllm-ascend/pull/878) [文档链接](https://vllm-ascend.readthedocs.io/" +"en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html)" + +#: ../../user_guide/release_notes.md:121 +msgid "Bug Fix" +msgstr "漏洞修复" + +#: ../../user_guide/release_notes.md:123 +msgid "" +"Qwen2.5-VL works for RLHF scenarios now. [#928](https://github.com/vllm-" +"project/vllm-ascend/pull/928)" +msgstr "" +"Qwen2.5-VL 现在已支持 RLHF 场景。[#928](https://github.com/vllm-project/" +"vllm-ascend/pull/928)" + +#: ../../user_guide/release_notes.md:124 +msgid "" +"Users can launch the model from online weights now. e.g. from huggingface " +"or modelscope directly [#858](https://github.com/vllm-project/vllm-ascend/" +"pull/858) [#918](https://github.com/vllm-project/vllm-ascend/pull/918)" +msgstr "" +"用户现在可以直接从在线权重启动模型。例如,可以直接从 huggingface 或 " +"modelscope 获取。[#858](https://github.com/vllm-project/vllm-ascend/" +"pull/858) [#918](https://github.com/vllm-project/vllm-ascend/pull/918)" + +#: ../../user_guide/release_notes.md:125 +msgid "" +"The meaningless log info `UserWorkspaceSize0` has been cleaned. [#911]" +"(https://github.com/vllm-project/vllm-ascend/pull/911)" +msgstr "" +"无意义的日志信息 `UserWorkspaceSize0` 已被清理。[#911](https://github.com/" +"vllm-project/vllm-ascend/pull/911)" + +#: ../../user_guide/release_notes.md:126 +msgid "" +"The log level for `Failed to import vllm_ascend_C` has been changed to " +"`warning` instead of `error`. 
[#956](https://github.com/vllm-project/vllm-" +"ascend/pull/956)" +msgstr "" +"`Failed to import vllm_ascend_C` 的日志级别已从 `error` 更改为 `warning`。" +"[#956](https://github.com/vllm-project/vllm-ascend/pull/956)" + +#: ../../user_guide/release_notes.md:127 +msgid "" +"DeepSeek MLA now works with chunked prefill in V1 Engine. Please note that " +"V1 engine in 0.7.3 is just expermential and only for test usage. [#849]" +"(https://github.com/vllm-project/vllm-ascend/pull/849) [#936](https://" +"github.com/vllm-project/vllm-ascend/pull/936)" +msgstr "" +"DeepSeek MLA 现已在 V1 引擎中支持分块预填充。请注意,0.7.3 版本中的 V1 引擎" +"仅为实验性,仅供测试使用。[#849](https://github.com/vllm-project/vllm-" +"ascend/pull/849) [#936](https://github.com/vllm-project/vllm-ascend/" +"pull/936)" + +#: ../../user_guide/release_notes.md:129 +msgid "Docs" +msgstr "文档" + +#: ../../user_guide/release_notes.md:131 +msgid "" +"The benchmark doc is updated for Qwen2.5 and Qwen2.5-VL [#792](https://" +"github.com/vllm-project/vllm-ascend/pull/792)" +msgstr "" +"基准文档已针对 Qwen2.5 和 Qwen2.5-VL 更新 [#792](https://github.com/vllm-" +"project/vllm-ascend/pull/792)" + +#: ../../user_guide/release_notes.md:132 +msgid "" +"Add the note to clear that only \"modelscope<1.23.0\" works with 0.7.3. " +"[#954](https://github.com/vllm-project/vllm-ascend/pull/954)" +msgstr "" +"添加说明,明确只有 \"modelscope<1.23.0\" 能与 0.7.3 一起使用。[#954]" +"(https://github.com/vllm-project/vllm-ascend/pull/954)" + +#: ../../user_guide/release_notes.md:134 +msgid "v0.7.3 - 2025.05.08" +msgstr "v0.7.3 - 2025.05.08" + +#: ../../user_guide/release_notes.md:136 ../../user_guide/release_notes.md:277 +msgid "🎉 Hello, World!" +msgstr "🎉 你好,世界!" + +#: ../../user_guide/release_notes.md:138 +msgid "" +"We are excited to announce the release of 0.7.3 for vllm-ascend. This is " +"the first official release. The functionality, performance, and stability " +"of this release are fully tested and verified. We encourage you to try it " +"out and provide feedback. 
We'll post bug fix versions in the future if " +"needed. Please follow the [official doc](https://vllm-ascend.readthedocs.io/" +"en/v0.7.3-dev) to start the journey." +msgstr "" +"我们很高兴地宣布 vllm-ascend 0.7.3 版本正式发布。这是首个正式发布的版本。该" +"版本的功能、性能和稳定性已充分测试和验证。我们鼓励您试用并反馈意见。如有需" +"要,未来我们将发布修复版本。请参阅[官方文档](https://vllm-ascend." +"readthedocs.io/en/v0.7.3-dev)开启您的体验之旅。" + +#: ../../user_guide/release_notes.md:141 +msgid "" +"This release includes all features landed in the previous release " +"candidates ([v0.7.1rc1](https://github.com/vllm-project/vllm-ascend/" +"releases/tag/v0.7.1rc1), [v0.7.3rc1](https://github.com/vllm-project/vllm-" +"ascend/releases/tag/v0.7.3rc1), [v0.7.3rc2](https://github.com/vllm-project/" +"vllm-ascend/releases/tag/v0.7.3rc2)). And all the features are fully tested " +"and verified. Visit the official doc the get the detail [feature](https://" +"vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html) " +"and [model](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/" +"supported_models.html) support matrix." +msgstr "" +"本次发布包含了所有在之前候选版本中加入的功能([v0.7.1rc1](https://github." +"com/vllm-project/vllm-ascend/releases/tag/v0.7.1rc1)、[v0.7.3rc1](https://" +"github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc1)、[v0.7.3rc2]" +"(https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc2))。所有" +"功能都经过了全面测试和验证。请访问官方文档获取详细的[功能](https://vllm-" +"ascend.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html)和[模" +"型](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/" +"supported_models.html)支持矩阵。" + +#: ../../user_guide/release_notes.md:142 +msgid "" +"Upgrade CANN to 8.1.RC1 to enable chunked prefill and automatic prefix " +"caching features. You can now enable them now." +msgstr "" +"将 CANN 升级到 8.1.RC1 以启用分块预填充和自动前缀缓存功能。您现在可以启用这" +"些功能了。" + +#: ../../user_guide/release_notes.md:143 +msgid "" +"Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version " +"of torch-npu now. 
Now users don't need to install the torch-npu by hand. " +"The 2.5.1 version of torch-npu will be installed automatically. [#662]" +"(https://github.com/vllm-project/vllm-ascend/pull/662)" +msgstr "" +"升级 PyTorch 至 2.5.1。vLLM Ascend 现在不再依赖于 torch-npu 的开发版本。用户" +"现在无需手动安装 torch-npu,2.5.1 版本的 torch-npu 会被自动安装。[#662]" +"(https://github.com/vllm-project/vllm-ascend/pull/662)" + +#: ../../user_guide/release_notes.md:144 +msgid "" +"Integrate MindIE Turbo into vLLM Ascend to improve DeepSeek V3/R1, Qwen 2 " +"series performance. [#708](https://github.com/vllm-project/vllm-ascend/" +"pull/708)" +msgstr "" +"将 MindIE Turbo 集成到 vLLM Ascend 以提升 DeepSeek V3/R1、Qwen 2 系列的性" +"能。[#708](https://github.com/vllm-project/vllm-ascend/pull/708)" + +#: ../../user_guide/release_notes.md:147 +msgid "" +"LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will " +"be improved in the next release. Please follow the official doc for more " +"usage information. Thanks for the contribution from China Merchants Bank. " +"[#700](https://github.com/vllm-project/vllm-ascend/pull/700)" +msgstr "" +"现在已经支持 LoRA、多LoRA 和动态服务。下一个版本中性能将会提升。请参阅官方文" +"档以获取更多用法信息。感谢招商银行的贡献。[#700](https://github.com/vllm-" +"project/vllm-ascend/pull/700)" + +#: ../../user_guide/release_notes.md:150 +msgid "" +"The performance of Qwen2 vl and Qwen2.5 vl is improved. [#702](https://" +"github.com/vllm-project/vllm-ascend/pull/702)" +msgstr "" +"Qwen2 vl 和 Qwen2.5 vl 的性能得到了提升。 [#702](https://github.com/vllm-" +"project/vllm-ascend/pull/702)" + +#: ../../user_guide/release_notes.md:151 +msgid "" +"The performance of `apply_penalties` and `topKtopP` ops are improved. [#525]" +"(https://github.com/vllm-project/vllm-ascend/pull/525)" +msgstr "" +"`apply_penalties` 和 `topKtopP` 操作的性能得到了提升。 [#525](https://" +"github.com/vllm-project/vllm-ascend/pull/525)" + +#: ../../user_guide/release_notes.md:154 +msgid "" +"Fixed a issue that may lead CPU memory leak. 
[#691](https://github.com/vllm-" +"project/vllm-ascend/pull/691) [#712](https://github.com/vllm-project/vllm-" +"ascend/pull/712)" +msgstr "" +"修复了可能导致CPU内存泄漏的问题。 [#691](https://github.com/vllm-project/" +"vllm-ascend/pull/691) [#712](https://github.com/vllm-project/vllm-ascend/" +"pull/712)" + +#: ../../user_guide/release_notes.md:155 +msgid "" +"A new environment `SOC_VERSION` is added. If you hit any soc detection " +"error when building with custom ops enabled, please set `SOC_VERSION` to a " +"suitable value. [#606](https://github.com/vllm-project/vllm-ascend/pull/606)" +msgstr "" +"新增了一个环境变量 `SOC_VERSION`。如果在启用自定义算子时构建过程中遇到 soc " +"检测错误,请将 `SOC_VERSION` 设置为合适的值。[#606](https://github.com/vllm-" +"project/vllm-ascend/pull/606)" + +#: ../../user_guide/release_notes.md:156 +msgid "" +"openEuler container image supported with v0.7.3-openeuler tag. [#665]" +"(https://github.com/vllm-project/vllm-ascend/pull/665)" +msgstr "" +"openEuler 容器镜像已支持 v0.7.3-openeuler 标签。[#665](https://github.com/" +"vllm-project/vllm-ascend/pull/665)" + +#: ../../user_guide/release_notes.md:157 +msgid "" +"Prefix cache feature works on V1 engine now. [#559](https://github.com/vllm-" +"project/vllm-ascend/pull/559)" +msgstr "" +"前缀缓存功能现在已在 V1 引擎上工作。[#559](https://github.com/vllm-project/" +"vllm-ascend/pull/559)" + +#: ../../user_guide/release_notes.md:159 +msgid "v0.8.5rc1 - 2025.05.06" +msgstr "v0.8.5rc1 - 2025.05.06" + +#: ../../user_guide/release_notes.md:161 +msgid "" +"This is the 1st release candidate of v0.8.5 for vllm-ascend. Please follow " +"the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the " +"journey. Now you can enable V1 egnine by setting the environment variable " +"`VLLM_USE_V1=1`, see the feature support status of vLLM Ascend in [here]" +"(https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/" +"supported_features.html)." 
+msgstr "" +"这是 vllm-ascend v0.8.5 的第一个候选发布版本。请按照[官方文档](https://vllm-" +"ascend.readthedocs.io/en/)开始使用。现在,您可以通过设置环境变量 " +"`VLLM_USE_V1=1` 启用 V1 引擎。关于 vLLM Ascend 的特性支持情况,请参见[这里]" +"(https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/" +"supported_features.html)。" + +#: ../../user_guide/release_notes.md:164 +msgid "" +"Upgrade CANN version to 8.1.RC1 to support chunked prefill and automatic " +"prefix caching (`--enable_prefix_caching`) when V1 is enabled [#747]" +"(https://github.com/vllm-project/vllm-ascend/pull/747)" +msgstr "" +"将 CANN 版本升级到 8.1.RC1,以支持在启用 V1 时的分块预填充和自动前缀缓存" +"(`--enable_prefix_caching`)[#747](https://github.com/vllm-project/vllm-" +"ascend/pull/747)" + +#: ../../user_guide/release_notes.md:165 +msgid "" +"Optimize Qwen2 VL and Qwen 2.5 VL [#701](https://github.com/vllm-project/" +"vllm-ascend/pull/701)" +msgstr "" +"优化 Qwen2 VL 和 Qwen 2.5 VL [#701](https://github.com/vllm-project/vllm-" +"ascend/pull/701)" + +#: ../../user_guide/release_notes.md:166 +#, python-brace-format +msgid "" +"Improve Deepseek V3 eager mode and graph mode performance, now you can use " +"--additional_config={'enable_graph_mode': True} to enable graph mode. 
[#598]" +"(https://github.com/vllm-project/vllm-ascend/pull/598) [#719](https://" +"github.com/vllm-project/vllm-ascend/pull/719)" +msgstr "" +"改进了 Deepseek V3 的 eager 模式和图模式性能,现在你可以使用 --" +"additional_config={'enable_graph_mode': True} 来启用图模式。[#598](https://" +"github.com/vllm-project/vllm-ascend/pull/598) [#719](https://github.com/" +"vllm-project/vllm-ascend/pull/719)" + +#: ../../user_guide/release_notes.md:169 +msgid "" +"Upgrade vLLM to 0.8.5.post1 [#715](https://github.com/vllm-project/vllm-" +"ascend/pull/715)" +msgstr "" +"将 vLLM 升级到 0.8.5.post1 [#715](https://github.com/vllm-project/vllm-" +"ascend/pull/715)" + +#: ../../user_guide/release_notes.md:170 +msgid "" +"Fix early return in CustomDeepseekV2MoE.forward during profile_run [#682]" +"(https://github.com/vllm-project/vllm-ascend/pull/682)" +msgstr "" +"修复在 profile_run 期间 CustomDeepseekV2MoE.forward 过早返回的问题 [#682]" +"(https://github.com/vllm-project/vllm-ascend/pull/682)" + +#: ../../user_guide/release_notes.md:171 +msgid "" +"Adapts for new quant model generated by modelslim [#719](https://github.com/" +"vllm-project/vllm-ascend/pull/719)" +msgstr "" +"适配由 modelslim 生成的新量化模型 [#719](https://github.com/vllm-project/" +"vllm-ascend/pull/719)" + +#: ../../user_guide/release_notes.md:172 +msgid "" +"Initial support on P2P Disaggregated Prefill based on llm_datadist [#694]" +"(https://github.com/vllm-project/vllm-ascend/pull/694)" +msgstr "" +"基于 llm_datadist 的 P2P 分布式 Prefill 初步支持 [#694](https://github.com/" +"vllm-project/vllm-ascend/pull/694)" + +#: ../../user_guide/release_notes.md:173 +msgid "" +"Use `/vllm-workspace` as code path and include `.git` in container image to " +"fix issue when start vllm under `/workspace` [#726](https://github.com/vllm-" +"project/vllm-ascend/pull/726)" +msgstr "" +"使用 `/vllm-workspace` 作为代码路径,并在容器镜像中包含 `.git` ,以修复在 `/" +"workspace` 下启动 vllm 时的问题 [#726](https://github.com/vllm-project/vllm-" +"ascend/pull/726)" + +#: ../../user_guide/release_notes.md:174 +msgid "" 
+"Optimize NPU memory usage to make DeepSeek R1 W8A8 32K model len work. " +"[#728](https://github.com/vllm-project/vllm-ascend/pull/728)" +msgstr "" +"优化NPU内存使用,以使 DeepSeek R1 W8A8 32K 模型长度能够运行。[#728](https://" +"github.com/vllm-project/vllm-ascend/pull/728)" + +#: ../../user_guide/release_notes.md:175 +msgid "" +"Fix `PYTHON_INCLUDE_PATH` typo in setup.py [#762](https://github.com/vllm-" +"project/vllm-ascend/pull/762)" +msgstr "" +"修复 setup.py 中的 `PYTHON_INCLUDE_PATH` 拼写错误 [#762](https://github.com/" +"vllm-project/vllm-ascend/pull/762)" + +#: ../../user_guide/release_notes.md:178 +msgid "" +"Add Qwen3-0.6B test [#717](https://github.com/vllm-project/vllm-ascend/" +"pull/717)" +msgstr "" +"添加 Qwen3-0.6B 测试 [#717](https://github.com/vllm-project/vllm-ascend/" +"pull/717)" + +#: ../../user_guide/release_notes.md:179 +msgid "" +"Add nightly CI [#668](https://github.com/vllm-project/vllm-ascend/pull/668)" +msgstr "" +"添加每晚持续集成 [#668](https://github.com/vllm-project/vllm-ascend/" +"pull/668)" + +#: ../../user_guide/release_notes.md:180 +msgid "" +"Add accuracy test report [#542](https://github.com/vllm-project/vllm-ascend/" +"pull/542)" +msgstr "" +"添加准确性测试报告 [#542](https://github.com/vllm-project/vllm-ascend/" +"pull/542)" + +#: ../../user_guide/release_notes.md:182 +msgid "v0.8.4rc2 - 2025.04.29" +msgstr "v0.8.4rc2 - 2025.04.29" + +#: ../../user_guide/release_notes.md:184 +msgid "" +"This is the second release candidate of v0.8.4 for vllm-ascend. Please " +"follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start " +"the journey. Some experimental features are included in this version, such " +"as W8A8 quantization and EP/DP support. We'll make them stable enough in " +"the next release." +msgstr "" +"这是 vllm-ascend 的 v0.8.4 第二个候选版本。请按照[官方文档](https://vllm-" +"ascend.readthedocs.io/en/)开始使用。本版本包含了一些实验性功能,如 W8A8 量化" +"和 EP/DP 支持。我们将在下一个版本中使这些功能更加稳定。" + +#: ../../user_guide/release_notes.md:187 +msgid "" +"Qwen3 and Qwen3MOE is supported now. 
Please follow the [official doc]" +"(https://vllm-ascend.readthedocs.io/en/latest/tutorials/single_npu.html) to " +"run the quick demo. [#709](https://github.com/vllm-project/vllm-ascend/" +"pull/709)" +msgstr "" +"现在已支持 Qwen3 和 Qwen3MOE。请按照[官方文档](https://vllm-ascend." +"readthedocs.io/en/latest/tutorials/single_npu.html)运行快速演示。[#709]" +"(https://github.com/vllm-project/vllm-ascend/pull/709)" + +#: ../../user_guide/release_notes.md:188 +msgid "" +"Ascend W8A8 quantization method is supported now. Please take the [official " +"doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/" +"multi_npu_quantization.html) for example. Any [feedback](https://github.com/" +"vllm-project/vllm-ascend/issues/619) is welcome. [#580](https://github.com/" +"vllm-project/vllm-ascend/pull/580)" +msgstr "" +"现在支持 Ascend W8A8 量化方法。请参考[官方文档](https://vllm-ascend." +"readthedocs.io/en/latest/tutorials/multi_npu_quantization.html) 示例。欢迎提" +"供任何[反馈](https://github.com/vllm-project/vllm-ascend/issues/619)。[#580]" +"(https://github.com/vllm-project/vllm-ascend/pull/580)" + +#: ../../user_guide/release_notes.md:189 +msgid "" +"DeepSeek V3/R1 works with DP, TP and MTP now. Please note that it's still " +"in experimental status. Let us know if you hit any problem. 
[#429](https://" +"github.com/vllm-project/vllm-ascend/pull/429) [#585](https://github.com/" +"vllm-project/vllm-ascend/pull/585) [#626](https://github.com/vllm-project/" +"vllm-ascend/pull/626) [#636](https://github.com/vllm-project/vllm-ascend/" +"pull/636) [#671](https://github.com/vllm-project/vllm-ascend/pull/671)" +msgstr "" +"DeepSeek V3/R1 现在已经支持 DP、TP 和 MTP。请注意,目前仍处于实验阶段。如果" +"遇到任何问题,请告知我们。 [#429](https://github.com/vllm-project/vllm-" +"ascend/pull/429) [#585](https://github.com/vllm-project/vllm-ascend/" +"pull/585) [#626](https://github.com/vllm-project/vllm-ascend/pull/626) " +"[#636](https://github.com/vllm-project/vllm-ascend/pull/636) [#671](https://" +"github.com/vllm-project/vllm-ascend/pull/671)" + +#: ../../user_guide/release_notes.md:192 +msgid "" +"ACLGraph feature is supported with V1 engine now. It's disabled by default " +"because this feature rely on CANN 8.1 release. We'll make it available by " +"default in the next release [#426](https://github.com/vllm-project/vllm-" +"ascend/pull/426)" +msgstr "" +"ACLGraph 特性现在已被 V1 引擎支持。它默认是禁用的,因为该特性依赖于 CANN " +"8.1 版本。我们将在下一个版本中默认启用此特性 [#426](https://github.com/vllm-" +"project/vllm-ascend/pull/426)。" + +#: ../../user_guide/release_notes.md:193 +msgid "" +"Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version " +"of torch-npu now. Now users don't need to install the torch-npu by hand. " +"The 2.5.1 version of torch-npu will be installed automatically. [#661]" +"(https://github.com/vllm-project/vllm-ascend/pull/661)" +msgstr "" +"升级 PyTorch 至 2.5.1。vLLM Ascend 现在不再依赖 dev 版本的 torch-npu,用户无" +"需手动安装 torch-npu。torch-npu 的 2.5.1 版本将会自动安装。[#661](https://" +"github.com/vllm-project/vllm-ascend/pull/661)" + +#: ../../user_guide/release_notes.md:196 +msgid "" +"MiniCPM model works now. 
[#645](https://github.com/vllm-project/vllm-ascend/" +"pull/645)" +msgstr "" +"MiniCPM 模型现在可以使用了。[#645](https://github.com/vllm-project/vllm-" +"ascend/pull/645)" + +#: ../../user_guide/release_notes.md:197 +msgid "" +"openEuler container image supported with `v0.8.4-openeuler` tag and customs " +"Ops build is enabled by default for openEuler OS. [#689](https://github.com/" +"vllm-project/vllm-ascend/pull/689)" +msgstr "" +"openEuler 容器镜像已支持 `v0.8.4-openeuler` 标签,并且 openEuler 操作系统默" +"认启用了自定义 Ops 构建。[#689](https://github.com/vllm-project/vllm-ascend/" +"pull/689)" + +#: ../../user_guide/release_notes.md:198 +msgid "" +"Fix ModuleNotFoundError bug to make Lora work [#600](https://github.com/" +"vllm-project/vllm-ascend/pull/600)" +msgstr "" +"修复 ModuleNotFoundError 错误以使 Lora 正常工作 [#600](https://github.com/" +"vllm-project/vllm-ascend/pull/600)" + +#: ../../user_guide/release_notes.md:199 +msgid "" +"Add \"Using EvalScope evaluation\" doc [#611](https://github.com/vllm-" +"project/vllm-ascend/pull/611)" +msgstr "" +"添加了“使用 EvalScope 评估”文档 [#611](https://github.com/vllm-project/vllm-" +"ascend/pull/611)" + +#: ../../user_guide/release_notes.md:200 +msgid "" +"Add a `VLLM_VERSION` environment to make vLLM version configurable to help " +"developer set correct vLLM version if the code of vLLM is changed by hand " +"locally. [#651](https://github.com/vllm-project/vllm-ascend/pull/651)" +msgstr "" +"新增了一个 `VLLM_VERSION` 环境变量,使 vLLM 版本可以配置,帮助开发者在本地手" +"动修改 vLLM 代码后,设置正确的 vLLM 版本。[#651](https://github.com/vllm-" +"project/vllm-ascend/pull/651)" + +#: ../../user_guide/release_notes.md:202 +msgid "v0.8.4rc1 - 2025.04.18" +msgstr "v0.8.4rc1 - 2025.04.18" + +#: ../../user_guide/release_notes.md:204 +msgid "" +"This is the first release candidate of v0.8.4 for vllm-ascend. Please " +"follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start " +"the journey. From this version, vllm-ascend will follow the newest version " +"of vllm and release every two weeks. 
For example, if vllm releases v0.8.5 " +"in the next two weeks, vllm-ascend will release v0.8.5rc1 instead of " +"v0.8.4rc2. Please find the detail from the [official documentation](https://" +"vllm-ascend.readthedocs.io/en/latest/community/versioning_policy." +"html#release-window)." +msgstr "" +"这是 vllm-ascend v0.8.4 的第一个候选发布版本。请按照[官方文档](https://vllm-" +"ascend.readthedocs.io/en/)开始使用。本版本起,vllm-ascend 将跟随 vllm 的最新" +"版本并每两周发布一次。例如,如果 vllm 在接下来的两周内发布 v0.8.5,vllm-" +"ascend 将发布 v0.8.5rc1,而不是 v0.8.4rc2。详细信息请参考[官方文档](https://" +"vllm-ascend.readthedocs.io/en/latest/community/versioning_policy." +"html#release-window)。" + +#: ../../user_guide/release_notes.md:208 +msgid "" +"vLLM V1 engine experimental support is included in this version. You can " +"visit [official guide](https://docs.vllm.ai/en/latest/getting_started/" +"v1_user_guide.html) to get more detail. By default, vLLM will fallback to " +"V0 if V1 doesn't work, please set `VLLM_USE_V1=1` environment if you want " +"to use V1 forcely." +msgstr "" +"本版本包含了对 vLLM V1 引擎的实验性支持。你可以访问[官方指南](https://docs." +"vllm.ai/en/latest/getting_started/v1_user_guide.html)获取更多详细信息。默认" +"情况下,如果 V1 不可用,vLLM 会自动回退到 V0。如果你想强制使用 V1,请设置 " +"`VLLM_USE_V1=1` 环境变量。" + +#: ../../user_guide/release_notes.md:209 +msgid "" +"LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will " +"be improved in the next release. Please follow the [official doc](https://" +"docs.vllm.ai/en/latest/features/lora.html) for more usage information. " +"Thanks for the contribution from China Merchants Bank. [#521](https://" +"github.com/vllm-project/vllm-ascend/pull/521)." +msgstr "" +"现在已支持 LoRA、Multi-LoRA 和动态服务。性能将在下一个版本中得到提升。请参阅" +"[官方文档](https://docs.vllm.ai/en/latest/features/lora.html)获取更多使用信" +"息。感谢招商银行的贡献。[#521](https://github.com/vllm-project/vllm-ascend/" +"pull/521)。" + +#: ../../user_guide/release_notes.md:210 +msgid "" +"Sleep Mode feature is supported. Currently it's only work on V0 engine. 
V1 " +"engine support will come soon. [#513](https://github.com/vllm-project/vllm-" +"ascend/pull/513)" +msgstr "" +"已支持休眠模式功能。目前它只在V0引擎上有效,V1引擎的支持即将到来。[#513]" +"(https://github.com/vllm-project/vllm-ascend/pull/513)" + +#: ../../user_guide/release_notes.md:214 +msgid "" +"The Ascend scheduler is added for V1 engine. This scheduler is more " +"affinity with Ascend hardware. More scheduler policy will be added in the " +"future. [#543](https://github.com/vllm-project/vllm-ascend/pull/543)" +msgstr "" +"为V1引擎新增了Ascend调度器。该调度器与Ascend硬件更加适配。未来还将添加更多调" +"度策略。 [#543](https://github.com/vllm-project/vllm-ascend/pull/543)" + +#: ../../user_guide/release_notes.md:215 +msgid "" +"Disaggregated Prefill feature is supported. Currently only 1P1D works. NPND " +"is under design by vllm team. vllm-ascend will support it once it's ready " +"from vLLM. Follow the [official guide](https://docs.vllm.ai/en/latest/" +"features/disagg_prefill.html) to use. [#432](https://github.com/vllm-" +"project/vllm-ascend/pull/432)" +msgstr "" +"支持分离式预填充(Disaggregated Prefill)功能。目前仅支持1P1D,NPND正在由" +"vllm团队设计中。一旦vLLM支持,vllm-ascend将会支持。请按照[官方指南](https://" +"docs.vllm.ai/en/latest/features/disagg_prefill.html)使用。[#432](https://" +"github.com/vllm-project/vllm-ascend/pull/432)" + +#: ../../user_guide/release_notes.md:216 +msgid "" +"Spec decode feature works now. Currently it's only work on V0 engine. V1 " +"engine support will come soon. [#500](https://github.com/vllm-project/vllm-" +"ascend/pull/500)" +msgstr "" +"Spec 解码功能现在可以使用。目前它只在 V0 引擎上工作,对 V1 引擎的支持即将到" +"来。[#500](https://github.com/vllm-project/vllm-ascend/pull/500)" + +#: ../../user_guide/release_notes.md:217 +msgid "" +"Structured output feature works now on V1 Engine. Currently it only " +"supports xgrammar backend while using guidance backend may get some errors. 
" +"[#555](https://github.com/vllm-project/vllm-ascend/pull/555)" +msgstr "" +"结构化输出功能现在已在V1引擎上生效。目前仅支持xgrammar后端,使用guidance后端" +"可能会出现一些错误。[#555](https://github.com/vllm-project/vllm-ascend/" +"pull/555)" + +#: ../../user_guide/release_notes.md:221 +msgid "" +"A new communicator `pyhccl` is added. It's used for call CANN HCCL library " +"directly instead of using `torch.distribute`. More usage of it will be " +"added in the next release [#503](https://github.com/vllm-project/vllm-" +"ascend/pull/503)" +msgstr "" +"新增了一个通信器 `pyhccl`。它用于直接调用 CANN HCCL 库,而不是使用 `torch." +"distribute`。将在下一个版本中添加更多用法 [#503](https://github.com/vllm-" +"project/vllm-ascend/pull/503)。" + +#: ../../user_guide/release_notes.md:222 +msgid "" +"The custom ops build is enabled by default. You should install the packages " +"like `gcc`, `cmake` first to build `vllm-ascend` from source. Set " +"`COMPILE_CUSTOM_KERNELS=0` environment to disable the compilation if you " +"don't need it. [#466](https://github.com/vllm-project/vllm-ascend/pull/466)" +msgstr "" +"自定义算子的构建默认是启用的。你应该先安装如 `gcc`、`cmake` 等包以便从源码编" +"译 `vllm-ascend`。如果不需要自定义算子的编译,可以设置环境变量 " +"`COMPILE_CUSTOM_KERNELS=0` 来禁用编译。 [#466](https://github.com/vllm-" +"project/vllm-ascend/pull/466)" + +#: ../../user_guide/release_notes.md:223 +msgid "" +"The custom op `rotay embedding` is enabled by default now to improve the " +"performance. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)" +msgstr "" +"自定义算子 `rotay embedding` 现在已默认启用,以提升性能。[#555](https://" +"github.com/vllm-project/vllm-ascend/pull/555)" + +#: ../../user_guide/release_notes.md:225 +msgid "v0.7.3rc2 - 2025.03.29" +msgstr "v0.7.3rc2 - 2025.03.29" + +#: ../../user_guide/release_notes.md:227 +msgid "" +"This is 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the " +"[official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start " +"the journey." 
+msgstr "" +"这是 vllm-ascend v0.7.3 的第二个候选发布版本。请根据[官方文档](https://vllm-" +"ascend.readthedocs.io/en/v0.7.3-dev)开始使用。" + +#: ../../user_guide/release_notes.md:228 ../../user_guide/release_notes.md:250 +msgid "" +"Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/" +"quick_start.html" +msgstr "" +"容器快速入门: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start." +"html" + +#: ../../user_guide/release_notes.md:229 ../../user_guide/release_notes.md:251 +msgid "" +"Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation." +"html" +msgstr "" +"安装: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html" + +#: ../../user_guide/release_notes.md:232 +msgid "" +"Add Ascend Custom Ops framewrok. Developers now can write customs ops using " +"AscendC. An example ops `rotary_embedding` is added. More tutorials will " +"come soon. The Custom Ops compilation is disabled by default when " +"installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371]" +"(https://github.com/vllm-project/vllm-ascend/pull/371)" +msgstr "" +"新增了Ascend自定义算子框架。开发者现在可以使用AscendC编写自定义算子。新增了" +"一个示例算子 `rotary_embedding` 。更多教程即将发布。安装vllm-ascend时,自定" +"义算子的编译默认是关闭的。可通过设置 `COMPILE_CUSTOM_KERNELS=1` 启用。[#371]" +"(https://github.com/vllm-project/vllm-ascend/pull/371)" + +#: ../../user_guide/release_notes.md:233 +msgid "" +"V1 engine is basic supported in this release. The full support will be done " +"in 0.8.X release. If you hit any issue or have any requirement of V1 " +"engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/" +"issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376)" +msgstr "" +"本版本对 V1 引擎提供了基础支持,全面支持将在 0.8.X 版本中完成。如果您遇到任" +"何问题或有 V1 引擎的相关需求,请在[这里](https://github.com/vllm-project/" +"vllm-ascend/issues/414)告诉我们。[#376](https://github.com/vllm-project/" +"vllm-ascend/pull/376)" + +#: ../../user_guide/release_notes.md:234 +msgid "" +"Prefix cache feature works now. 
You can set `enable_prefix_caching=True` to " +"enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282)" +msgstr "" +"前缀缓存功能现在已经可用。你可以通过设置 `enable_prefix_caching=True` 来启用" +"该功能。[#282](https://github.com/vllm-project/vllm-ascend/pull/282)" + +#: ../../user_guide/release_notes.md:237 +msgid "" +"Bump torch_npu version to dev20250320.3 to improve accuracy to fix `!!!` " +"output problem. [#406](https://github.com/vllm-project/vllm-ascend/pull/406)" +msgstr "" +"将 torch_npu 版本升级到 dev20250320.3 以提升精度,修复 `!!!` 输出问题。" +"[#406](https://github.com/vllm-project/vllm-ascend/pull/406)" + +#: ../../user_guide/release_notes.md:240 +msgid "" +"The performance of Qwen2-vl is improved by optimizing patch embedding " +"(Conv3D). [#398](https://github.com/vllm-project/vllm-ascend/pull/398)" +msgstr "" +"通过优化 patch embedding(Conv3D),Qwen2-vl 的性能得到了提升。[#398]" +"(https://github.com/vllm-project/vllm-ascend/pull/398)" + +#: ../../user_guide/release_notes.md:244 +msgid "" +"Fixed a bug to make sure multi step scheduler feature work. [#349](https://" +"github.com/vllm-project/vllm-ascend/pull/349)" +msgstr "" +"修复了一个错误,以确保多步调度器功能正常工作。[#349](https://github.com/" +"vllm-project/vllm-ascend/pull/349)" + +#: ../../user_guide/release_notes.md:245 +msgid "" +"Fixed a bug to make prefix cache feature works with correct accuracy. [#424]" +"(https://github.com/vllm-project/vllm-ascend/pull/424)" +msgstr "" +"修复了一个 bug,使前缀缓存功能能够以正确的准确性运行。[#424](https://github." +"com/vllm-project/vllm-ascend/pull/424)" + +#: ../../user_guide/release_notes.md:247 +msgid "v0.7.3rc1 - 2025.03.14" +msgstr "v0.7.3rc1 - 2025.03.14" + +#: ../../user_guide/release_notes.md:249 +msgid "" +"🎉 Hello, World! This is the first release candidate of v0.7.3 for vllm-" +"ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/" +"en/v0.7.3-dev) to start the journey." 
+msgstr "" +"🎉 你好,世界!这是 vllm-ascend v0.7.3 的第一个候选发布版本。请按照[官方文" +"档](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev)开始你的旅程。" + +#: ../../user_guide/release_notes.md:254 +msgid "" +"DeepSeek V3/R1 works well now. Read the [official guide](https://vllm-" +"ascend.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html) to start! " +"[#242](https://github.com/vllm-project/vllm-ascend/pull/242)" +msgstr "" +"DeepSeek V3/R1 现在运行良好。请阅读[官方指南](https://vllm-ascend." +"readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html)开始![#242](https://" +"github.com/vllm-project/vllm-ascend/pull/242)" + +#: ../../user_guide/release_notes.md:255 +msgid "" +"Speculative decoding feature is supported. [#252](https://github.com/vllm-" +"project/vllm-ascend/pull/252)" +msgstr "" +"已支持猜测性解码功能。[#252](https://github.com/vllm-project/vllm-ascend/" +"pull/252)" + +#: ../../user_guide/release_notes.md:256 +msgid "" +"Multi step scheduler feature is supported. [#300](https://github.com/vllm-" +"project/vllm-ascend/pull/300)" +msgstr "" +"已支持多步调度器功能。[#300](https://github.com/vllm-project/vllm-ascend/" +"pull/300)" + +#: ../../user_guide/release_notes.md:259 +msgid "" +"Bump torch_npu version to dev20250308.3 to improve `_exponential` accuracy" +msgstr "将 torch_npu 版本升级到 dev20250308.3,以提升 `_exponential` 的精度" + +#: ../../user_guide/release_notes.md:260 +msgid "" +"Added initial support for pooling models. Bert based model, such as `BAAI/" +"bge-base-en-v1.5` and `BAAI/bge-reranker-v2-m3` works now. [#229](https://" +"github.com/vllm-project/vllm-ascend/pull/229)" +msgstr "" +"新增了对池化模型的初步支持。现在支持 Bert 基础模型,如 `BAAI/bge-base-en-" +"v1.5` 和 `BAAI/bge-reranker-v2-m3`。 [#229](https://github.com/vllm-project/" +"vllm-ascend/pull/229)" + +#: ../../user_guide/release_notes.md:263 +msgid "" +"The performance of Qwen2-VL is improved. 
[#241](https://github.com/vllm-" +"project/vllm-ascend/pull/241)" +msgstr "" +"Qwen2-VL 的性能得到了提升。[#241](https://github.com/vllm-project/vllm-" +"ascend/pull/241)" + +#: ../../user_guide/release_notes.md:264 +msgid "" +"MiniCPM is now supported [#164](https://github.com/vllm-project/vllm-ascend/" +"pull/164)" +msgstr "" +"MiniCPM 现在已被支持 [#164](https://github.com/vllm-project/vllm-ascend/" +"pull/164)" + +#: ../../user_guide/release_notes.md:267 +msgid "" +"Support MTP(Multi-Token Prediction) for DeepSeek V3/R1 [#236](https://" +"github.com/vllm-project/vllm-ascend/pull/236)" +msgstr "" +"为 DeepSeek V3/R1 支持 MTP(多标记预测) [#236](https://github.com/vllm-" +"project/vllm-ascend/pull/236)" + +#: ../../user_guide/release_notes.md:268 +msgid "" +"[Docs] Added more model tutorials, include DeepSeek, QwQ, Qwen and Qwen " +"2.5VL. See the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-" +"dev/tutorials/index.html) for detail" +msgstr "" +"[文档] 增加了更多的模型教程,包括 DeepSeek、QwQ、Qwen 和 Qwen 2.5VL。详情请" +"参见[官方文档](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/" +"index.html)。" + +#: ../../user_guide/release_notes.md:269 +msgid "" +"Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve: https://github.com/vllm-" +"project/vllm/pull/13807" +msgstr "" +"在 vLLM v0.7.3 上锁定 modelscope 版本低于 1.23.0,以解决:https://github." +"com/vllm-project/vllm/pull/13807" + +#: ../../user_guide/release_notes.md:271 ../../user_guide/release_notes.md:302 +msgid "Known issues" +msgstr "已知问题" + +#: ../../user_guide/release_notes.md:272 +msgid "" +"In [some cases](https://github.com/vllm-project/vllm-ascend/issues/324), " +"especially when the input/output is very long, the accuracy of output may " +"be incorrect. We are working on it. It'll be fixed in the next release." 
+msgstr "" +"在[某些情况下](https://github.com/vllm-project/vllm-ascend/issues/324),特别" +"是当输入或输出非常长时,输出的准确性可能会有误。我们正在解决这个问题。将在下" +"一个版本中修复。" + +#: ../../user_guide/release_notes.md:273 +msgid "" +"Improved and reduced the garbled code in model output. But if you still hit " +"the issue, try to change the generation config value, such as " +"`temperature`, and try again. There is also a knonwn issue shown below. Any " +"[feedback](https://github.com/vllm-project/vllm-ascend/issues/267) is " +"welcome. [#277](https://github.com/vllm-project/vllm-ascend/pull/277)" +msgstr "" +"改进并减少了模型输出中的乱码问题。但如果你仍然遇到该问题,请尝试更改生成配置" +"的参数,例如 `temperature`,然后再试一次。下面还列出了一个已知问题。欢迎提供" +"任何[反馈](https://github.com/vllm-project/vllm-ascend/issues/267)。[#277]" +"(https://github.com/vllm-project/vllm-ascend/pull/277)" + +#: ../../user_guide/release_notes.md:275 +msgid "v0.7.1rc1 - 2025.02.19" +msgstr "v0.7.1rc1 - 2025.02.19" + +#: ../../user_guide/release_notes.md:279 +msgid "" +"We are excited to announce the first release candidate of v0.7.1 for vllm-" +"ascend." +msgstr "我们很高兴地宣布 vllm-ascend v0.7.1 的第一个候选版本发布。" + +#: ../../user_guide/release_notes.md:281 +msgid "" +"vLLM Ascend Plugin (vllm-ascend) is a community maintained hardware plugin " +"for running vLLM on the Ascend NPU. With this release, users can now enjoy " +"the latest features and improvements of vLLM on the Ascend NPU." +msgstr "" +"vLLM Ascend 插件(vllm-ascend)是一个由社区维护的硬件插件,用于在 Ascend " +"NPU 上运行 vLLM。通过此版本,用户现在可以在 Ascend NPU 上享受到 vLLM 的最新" +"功能和改进。" + +#: ../../user_guide/release_notes.md:283 +msgid "" +"Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/" +"v0.7.1-dev) to start the journey. Note that this is a release candidate, " +"and there may be some bugs or issues. 
We appreciate your feedback and " +"suggestions [here](https://github.com/vllm-project/vllm-ascend/issues/19)" +msgstr "" +"请参阅[官方文档](https://vllm-ascend.readthedocs.io/en/v0.7.1-dev)开始您的体" +"验之旅。请注意,这是一个候选发布版本,可能会有一些漏洞或问题。我们非常欢迎您" +"在[这里](https://github.com/vllm-project/vllm-ascend/issues/19)提交反馈和建" +"议。" + +#: ../../user_guide/release_notes.md:287 +msgid "" +"Initial supports for Ascend NPU on vLLM. [#3](https://github.com/vllm-" +"project/vllm-ascend/pull/3)" +msgstr "" +"在 vLLM 上初步支持 Ascend NPU。[#3](https://github.com/vllm-project/vllm-" +"ascend/pull/3)" + +#: ../../user_guide/release_notes.md:288 +msgid "" +"DeepSeek is now supported. [#88](https://github.com/vllm-project/vllm-" +"ascend/pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68)" +msgstr "" +"现在已支持 DeepSeek。 [#88](https://github.com/vllm-project/vllm-ascend/" +"pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68)" + +#: ../../user_guide/release_notes.md:289 +msgid "" +"Qwen, Llama series and other popular models are also supported, you can see " +"more details in [here](https://vllm-ascend.readthedocs.io/en/latest/" +"user_guide/supported_models.html)." +msgstr "" +"Qwen、Llama 系列及其他流行的模型也受支持,更多详情可参见[这里](https://vllm-" +"ascend.readthedocs.io/en/latest/user_guide/supported_models.html)。" + +#: ../../user_guide/release_notes.md:293 +msgid "" +"Added the Ascend quantization config option, the implementation will coming " +"soon. [#7](https://github.com/vllm-project/vllm-ascend/pull/7) [#73]" +"(https://github.com/vllm-project/vllm-ascend/pull/73)" +msgstr "" +"新增了 Ascend 量化配置选项,具体实现即将推出。[#7](https://github.com/vllm-" +"project/vllm-ascend/pull/7) [#73](https://github.com/vllm-project/vllm-" +"ascend/pull/73)" + +#: ../../user_guide/release_notes.md:294 +msgid "" +"Add silu_and_mul and rope ops and add mix ops into attention layer. 
[#18]" +"(https://github.com/vllm-project/vllm-ascend/pull/18)" +msgstr "" +"添加 silu_and_mul 和 rope 操作,并将混合操作加入到 attention 层。 [#18]" +"(https://github.com/vllm-project/vllm-ascend/pull/18)" + +#: ../../user_guide/release_notes.md:298 +msgid "" +"[CI] Enable Ascend CI to actively monitor and improve quality for vLLM on " +"Ascend. [#3](https://github.com/vllm-project/vllm-ascend/pull/3)" +msgstr "" +"[CI] 启用 Ascend CI,主动监测并提升 vLLM 在 Ascend 上的质量。[#3](https://" +"github.com/vllm-project/vllm-ascend/pull/3)" + +#: ../../user_guide/release_notes.md:299 +msgid "" +"[Docker] Add vllm-ascend container image [#64](https://github.com/vllm-" +"project/vllm-ascend/pull/64)" +msgstr "" +"[Docker] 添加 vllm-ascend 容器镜像 [#64](https://github.com/vllm-project/" +"vllm-ascend/pull/64)" + +#: ../../user_guide/release_notes.md:300 +msgid "" +"[Docs] Add a [live doc](https://vllm-ascend.readthedocs.org) [#55](https://" +"github.com/vllm-project/vllm-ascend/pull/55)" +msgstr "" +"[文档] 添加了一个 [在线文档](https://vllm-ascend.readthedocs.org) [#55]" +"(https://github.com/vllm-project/vllm-ascend/pull/55)" + +#: ../../user_guide/release_notes.md:304 +msgid "" +"This release relies on an unreleased torch_npu version. It has been " +"installed within official container image already. Please [install](https://" +"vllm-ascend.readthedocs.io/en/v0.7.1rc1/installation.html) it manually if " +"you are using non-container environment." +msgstr "" +"此版本依赖于尚未发布的 torch_npu 版本。该版本已集成在官方容器镜像中。如果您" +"使用的是非容器环境,请[手动安装](https://vllm-ascend.readthedocs.io/en/" +"v0.7.1rc1/installation.html)。" + +#: ../../user_guide/release_notes.md:305 +msgid "" +"There are logs like `No platform detected, vLLM is running on " +"UnspecifiedPlatform` or `Failed to import from vllm._C with " +"ModuleNotFoundError(\"No module named 'vllm._C'\")` shown when running vllm-" +"ascend. It actually doesn't affect any functionality and performance. You " +"can just ignore it. 
And it has been fixed in this [PR](https://github.com/" +"vllm-project/vllm/pull/12432) which will be included in v0.7.3 soon." +msgstr "" +"在运行 vllm-ascend 时,会显示类似 `No platform detected, vLLM is running on " +"UnspecifiedPlatform` 或 `Failed to import from vllm._C with " +"ModuleNotFoundError(\"No module named 'vllm._C'\")` 的日志。这实际上不会影响" +"任何功能和性能,你可以直接忽略它。这个问题已在此 [PR](https://github.com/" +"vllm-project/vllm/pull/12432) 中修复,并很快会在 v0.7.3 版本中包含。" + +#: ../../user_guide/release_notes.md:306 +msgid "" +"There are logs like `# CPU blocks: 35064, # CPU blocks: 2730` shown when " +"running vllm-ascend which should be `# NPU blocks:` . It actually doesn't " +"affect any functionality and performance. You can just ignore it. And it " +"has been fixed in this [PR](https://github.com/vllm-project/vllm/" +"pull/13378) which will be included in v0.7.3 soon." +msgstr "" +"在运行 vllm-ascend 时,会显示类似 `# CPU blocks: 35064, # CPU blocks: 2730` " +"的日志,实际应该为 `# NPU blocks:`。这实际上不会影响任何功能和性能,你可以忽" +"略它。该问题已在这个 [PR](https://github.com/vllm-project/vllm/pull/13378) " +"中修复,并将在 v0.7.3 版本中包含。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po new file mode 100644 index 0000000..040132f --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/index.po @@ -0,0 +1,30 @@ +# Translations template for PROJECT. +# Copyright (C) 2025 ORGANIZATION +# This file is distributed under the same license as the PROJECT project. +# FIRST AUTHOR , 2025. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PROJECT VERSION\n" +"Report-Msgid-Bugs-To: EMAIL@ADDRESS\n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/support_matrix/index.md:5 +msgid "Support Matrix" +msgstr "支持矩阵" + +#: ../../user_guide/support_matrix/index.md:1 +msgid "Features and models" +msgstr "特性与模型" + +#: ../../user_guide/support_matrix/index.md:3 +msgid "This section provides a detailed supported matrix by vLLM Ascend." +msgstr "本节提供了 vLLM Ascend 的详细支持矩阵。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po new file mode 100644 index 0000000..b0bfbbb --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_features.po @@ -0,0 +1,264 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/support_matrix/supported_features.md:1 +msgid "Feature Support" +msgstr "功能支持" + +#: ../../user_guide/support_matrix/supported_features.md:3 +msgid "" +"The feature support principle of vLLM Ascend is: **aligned with the vLLM**. 
" +"We are also actively collaborating with the community to accelerate support." +msgstr "vLLM Ascend 的特性支持原则是:**与 vLLM 保持一致**。我们也在积极与社区合作,加快支持进度。" + +#: ../../user_guide/support_matrix/supported_features.md:5 +msgid "" +"You can check the [support status of vLLM V1 Engine][v1_user_guide]. Below " +"is the feature support status of vLLM Ascend:" +msgstr "你可以查看 [vLLM V1 引擎的支持状态][v1_user_guide]。下面是 vLLM Ascend 的功能支持情况:" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Feature" +msgstr "特性" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "vLLM V0 Engine" +msgstr "vLLM V0 引擎" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "vLLM V1 Engine" +msgstr "vLLM V1 引擎" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Next Step" +msgstr "下一步" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Chunked Prefill" +msgstr "分块预填充" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🟢 Functional" +msgstr "🟢 功能性" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Functional, see detail note: [Chunked Prefill][cp]" +msgstr "功能性,详见说明:[分块预填充][cp]" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Automatic Prefix Caching" +msgstr "自动前缀缓存" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Functional, see detail note: [vllm-ascend#732][apc]" +msgstr "可用,请参见详细说明:[vllm-ascend#732][apc]" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "LoRA" +msgstr "LoRA" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "[vllm-ascend#396][multilora], [vllm-ascend#893][v1 multilora]" +msgstr "[vllm-ascend#396][multilora],[vllm-ascend#893][v1 multilora]" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Prompt adapter" +msgstr "提示适配器" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🔴 No plan" +msgstr "🔴 无计划" + +#: ../../user_guide/support_matrix/supported_features.md +msgid 
"This feature has been deprecated by vllm." +msgstr "此功能已被 vllm 弃用。" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Speculative decoding" +msgstr "猜测式解码" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Basic support" +msgstr "基础支持" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Pooling" +msgstr "池化" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🟡 Planned" +msgstr "🟡 计划中" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "CI needed and adapting more models; V1 support rely on vLLM support." +msgstr "需要持续集成(CI)并适配更多模型;V1 的支持依赖于 vLLM 的支持。" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Enc-dec" +msgstr "Enc-dec(编码-解码)" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🔴 NO plan" +msgstr "🔴 没有计划" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Plan in 2025.06.30" +msgstr "2025.06.30 的计划" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Multi Modality" +msgstr "多模态" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "[Tutorial][multimodal], optimizing and adapting more models" +msgstr "[教程][multimodal],优化和适配更多模型" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "LogProbs" +msgstr "LogProbs" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "CI needed" +msgstr "需要持续集成(CI)" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Prompt logProbs" +msgstr "提示 logProbs" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Async output" +msgstr "异步输出" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Multi step scheduler" +msgstr "多步调度器" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🔴 Deprecated" +msgstr "🔴 已弃用" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "[vllm#8779][v1_rfc], replaced by [vLLM V1 Scheduler][v1_scheduler]" +msgstr "[vllm#8779][v1_rfc],已被 [vLLM 
V1 调度器][v1_scheduler] 替代" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Best of" +msgstr "精选" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "[vllm#13361][best_of], CI needed" +msgstr "[vllm#13361][best_of],需要持续集成(CI)" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Beam search" +msgstr "束搜索" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Guided Decoding" +msgstr "引导解码" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "[vllm-ascend#177][guided_decoding]" +msgstr "[vllm-ascend#177][guided_decoding]" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Tensor Parallel" +msgstr "张量并行" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Pipeline Parallel" +msgstr "流水线并行" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Expert Parallel" +msgstr "专家并行" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "CI needed; No plan on V0 support" +msgstr "需要持续集成;没有支持V0的计划" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Data Parallel" +msgstr "数据并行" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "CI needed; No plan on V0 support" +msgstr "需要 CI;暂无 V0 支持计划" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Prefill Decode Disaggregation" +msgstr "预填充 解码 拆分" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "1P1D available, working on xPyD and V1 support." 
+msgstr "1P1D 已可用,正在开发 xPyD 和 V1 支持。" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Quantization" +msgstr "量化" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "W8A8 available, CI needed; working on more quantization method support" +msgstr "W8A8 已可用,需要持续集成(CI);正在开发对更多量化方法的支持。" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Graph Mode" +msgstr "图模式" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "🔵 Experimental" +msgstr "🔵 实验性" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Experimental, see detail note: [vllm-ascend#767][graph_mode]" +msgstr "实验性功能,详见说明:[vllm-ascend#767][graph_mode]" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "Sleep Mode" +msgstr "睡眠模式" + +#: ../../user_guide/support_matrix/supported_features.md +msgid "level=1 available, CI needed, working on V1 support" +msgstr "level=1 可用,需要CI,正在开发 V1 支持" + +#: ../../user_guide/support_matrix/supported_features.md:33 +msgid "🟢 Functional: Fully operational, with ongoing optimizations." +msgstr "🟢 功能性:完全可用,正在持续优化中。" + +#: ../../user_guide/support_matrix/supported_features.md:34 +msgid "" +"🔵 Experimental: Experimental support, interfaces and functions may change." +msgstr "🔵 实验性:实验性支持,接口和功能可能会发生变化。" + +#: ../../user_guide/support_matrix/supported_features.md:35 +msgid "🚧 WIP: Under active development, will be supported soon." +msgstr "🚧 WIP:正在积极开发中,很快将会支持。" + +#: ../../user_guide/support_matrix/supported_features.md:36 +msgid "" +"🟡 Planned: Scheduled for future implementation (some may have open " +"PRs/RFCs)." +msgstr "🟡 计划中:已安排将来实现(其中一些可能已有开放的PR/RFC)。" + +#: ../../user_guide/support_matrix/supported_features.md:37 +msgid "🔴 NO plan / Deprecated: No plan for V0 or deprecated by vLLM v1." 
+msgstr "🔴 没有计划 / 已弃用:V0 没有计划或已被 vLLM v1 弃用。" diff --git a/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po new file mode 100644 index 0000000..8ec7805 --- /dev/null +++ b/docs/source/locale/zh_CN/LC_MESSAGES/user_guide/support_matrix/supported_models.po @@ -0,0 +1,214 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) 2025, vllm-ascend team +# This file is distributed under the same license as the vllm-ascend +# package. +# FIRST AUTHOR , 2025. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: vllm-ascend\n" +"Report-Msgid-Bugs-To: \n" +"POT-Creation-Date: 2025-07-18 09:01+0800\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: zh_CN \n" +"Language: zh_CN\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"Generated-By: Babel 2.17.0\n" + +#: ../../user_guide/support_matrix/supported_models.md:1 +msgid "Model Support" +msgstr "模型支持" + +#: ../../user_guide/support_matrix/supported_models.md:3 +msgid "Text-only Language Models" +msgstr "纯文本语言模型" + +#: ../../user_guide/support_matrix/supported_models.md:5 +#: ../../user_guide/support_matrix/supported_models.md:38 +msgid "Generative Models" +msgstr "生成模型" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Model" +msgstr "模型" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Supported" +msgstr "支持" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Note" +msgstr "注释" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "DeepSeek v3" +msgstr "DeepSeek v3" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "✅" +msgstr "✅" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "DeepSeek R1" +msgstr "DeepSeek R1" + +#: ../../user_guide/support_matrix/supported_models.md +msgid 
msgstr "DeepSeek 蒸馏(Qwen/LLama)"
../../user_guide/support_matrix/supported_models.md +msgid "ChatGLM" +msgstr "ChatGLM" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "[#554](https://github.com/vllm-project/vllm-ascend/issues/554)" +msgstr "[#554](https://github.com/vllm-project/vllm-ascend/issues/554)" + +#: ../../user_guide/support_matrix/supported_models.md:29 +msgid "Pooling Models" +msgstr "池化模型" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "XLM-RoBERTa-based" +msgstr "基于XLM-RoBERTa" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Molmo" +msgstr "Molmo" + +#: ../../user_guide/support_matrix/supported_models.md:36 +msgid "Multimodal Language Models" +msgstr "多模态语言模型" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Qwen2-VL" +msgstr "Qwen2-VL" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Qwen2.5-VL" +msgstr "Qwen2.5-VL" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "LLaVA 1.5" +msgstr "LLaVA 1.5" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "LLaVA 1.6" +msgstr "LLaVA 1.6" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "[#553](https://github.com/vllm-project/vllm-ascend/issues/553)" +msgstr "[#553](https://github.com/vllm-project/vllm-ascend/issues/553)" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "InternVL2" +msgstr "InternVL2" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "InternVL2.5" +msgstr "InternVL2.5" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Qwen2-Audio" +msgstr "Qwen2-Audio" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "LLaVA-Next" +msgstr "LLaVA-Next" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "LLaVA-Next-Video" +msgstr "LLaVA-Next-Video" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Phi-3-Vison/Phi-3.5-Vison" +msgstr "Phi-3-Vison/Phi-3.5-Vison" + +#: ../../user_guide/support_matrix/supported_models.md +msgid 
"GLM-4v" +msgstr "GLM-4v" + +#: ../../user_guide/support_matrix/supported_models.md +msgid "Ultravox" +msgstr "Ultravox" diff --git a/docs/source/logos/vllm-ascend-logo-text-dark.png b/docs/source/logos/vllm-ascend-logo-text-dark.png new file mode 100644 index 0000000..f534d09 Binary files /dev/null and b/docs/source/logos/vllm-ascend-logo-text-dark.png differ diff --git a/docs/source/logos/vllm-ascend-logo-text-light.png b/docs/source/logos/vllm-ascend-logo-text-light.png new file mode 100644 index 0000000..b71b492 Binary files /dev/null and b/docs/source/logos/vllm-ascend-logo-text-light.png differ diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md new file mode 100644 index 0000000..d61394f --- /dev/null +++ b/docs/source/quick_start.md @@ -0,0 +1,186 @@ +# Quickstart + +## Prerequisites + +### Supported Devices +- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2) +- Atlas 800I A2 Inference series (Atlas 800I A2) +- Atlas A3 Training series (Atlas 800T A3, Atlas 900 A3 SuperPoD, Atlas 9000 A3 SuperPoD) +- Atlas 800I A3 Inference series (Atlas 800I A3) +- [Experimental] Atlas 300I Inference series (Atlas 300I Duo) + +## Setup environment using container + +:::::{tab-set} +::::{tab-item} Ubuntu + +```{code-block} bash + :substitutions: + +# Update DEVICE according to your device (/dev/davinci[0-7]) +export DEVICE=/dev/davinci0 +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device $DEVICE \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 
to help developers immediately apply code changes without requiring a new installation.
Try running the Python script below directly, or use a `python3` shell to generate texts:
## Getting Started
+ +Each DP rank is deployed as a separate “core engine” process which communicates with front-end process(es) via ZMQ sockets. Data Parallel can be combined with Tensor Parallel, in which case each DP engine owns a number of per-NPU worker processes equal to the TP size. + +For Mixture-of-Experts (MoE) models — especially advanced architectures like DeepSeek that utilize Multi-head Latent Attention (MLA) — a hybrid parallelism approach is recommended: + +- Use **Data Parallelism (DP)** for attention layers, which are replicated across devices and handle separate batches. +- Use **Expert or Tensor Parallelism (EP/TP)** for expert layers, which are sharded across devices to distribute the computation. + +This division enables attention layers to be replicated across Data Parallel (DP) ranks, enabling them to process different batches independently. Meanwhile, expert layers are partitioned (sharded) across devices using Expert or Tensor Parallelism(DP*TP), maximizing hardware utilization and efficiency. + +In these cases the data parallel ranks are not completely independent, forward passes must be aligned and expert layers across all ranks are required to synchronize during every forward pass, even if there are fewer requests to be processed than DP ranks. + +For MoE models, when any requests are in progress in any rank, we must ensure that empty “dummy” forward passes are performed in all ranks which don’t currently have any requests scheduled. This is handled via a separate DP `Coordinator` process which communicates with all of the ranks, and a collective operation performed every N steps to determine when all ranks become idle and can be paused. When TP is used in conjunction with DP, expert layers form an EP or TP group of size (DP x TP). + +## Verify Multi-Node Communication Environment + +### Physical Layer Requirements: + +- The physical machines must be located on the same WLAN, with network connectivity. 
quantized model across multiple nodes.
can be obtained from
https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/quantization.html +vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \ +--host 0.0.0.0 \ +--port 8004 \ +--data-parallel-size 4 \ +--data-parallel-size-local 2 \ +--data-parallel-address $local_ip \ +--data-parallel-rpc-port 13389 \ +--tensor-parallel-size 4 \ +--seed 1024 \ +--served-model-name deepseek_v3.1 \ +--enable-expert-parallel \ +--max-num-seqs 16 \ +--max-model-len 32768 \ +--quantization ascend \ +--max-num-batched-tokens 4096 \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--gpu-memory-utilization 0.9 \ +--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}' +``` + +**node1** + +```shell +#!/bin/sh + +nic_name="xxx" +local_ip="xxx" + +export VLLM_USE_MODELSCOPE=True +export HCCL_IF_IP=$local_ip +export GLOO_SOCKET_IFNAME=$nic_name +export TP_SOCKET_IFNAME=$nic_name +export HCCL_SOCKET_IFNAME=$nic_name +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +export HCCL_BUFFSIZE=1024 + +vllm serve vllm-ascend/DeepSeek-V3.1-W8A8 \ +--host 0.0.0.0 \ +--port 8004 \ +--headless \ +--data-parallel-size 4 \ +--data-parallel-size-local 2 \ +--data-parallel-start-rank 2 \ +--data-parallel-address { node0 ip } \ +--data-parallel-rpc-port 13389 \ +--tensor-parallel-size 4 \ +--seed 1024 \ +--quantization ascend \ +--served-model-name deepseek_v3.1 \ +--max-num-seqs 16 \ +--max-model-len 32768 \ +--max-num-batched-tokens 4096 \ +--enable-expert-parallel \ +--trust-remote-code \ +--no-enable-prefix-caching \ +--gpu-memory-utilization 0.92 \ +--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}' +``` + +The Deployment view looks like: +![alt text](../assets/multi_node_dp_deepseek.png) + +Once your server is started, you can query the model with input prompts: + +```shell +curl http://{ node0 ip:8004 }/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + 
"model": "deepseek_v3.1", + "prompt": "The future of AI is", + "max_tokens": 50, + "temperature": 0 + }' +``` + +## Run benchmarks +For details please refer to [benchmark](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks) + +```shell +export VLLM_USE_MODELSCOPE=true +vllm bench serve --model vllm-ascend/DeepSeek-V3.1-W8A8 --served-model-name deepseek_v3.1 \ +--dataset-name random --random-input-len 128 --random-output-len 128 \ +--num-prompts 200 --trust-remote-code --base-url "http://{ node0 ip }:8004" --request-rate 1 +``` diff --git a/docs/source/tutorials/multi_node_kimi.md b/docs/source/tutorials/multi_node_kimi.md new file mode 100644 index 0000000..dfada85 --- /dev/null +++ b/docs/source/tutorials/multi_node_kimi.md @@ -0,0 +1,153 @@ +# Multi-Node-DP (Kimi-K2) + +## Verify Multi-Node Communication Environment + +referring to [multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process) + +## Run with docker +Assume you have two Atlas 800 A3(64G*16) nodes(or 4 * A2), and want to deploy the `Kimi-K2-Instruct-W8A8` quantitative model across multi-node. 
quantized model across multiple nodes.
If you want to do the quantization manually, please refer to
with input prompts: + +```shell +curl http://{ node0 ip:8004 }/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "kimi", + "prompt": "The future of AI is", + "max_tokens": 50, + "temperature": 0 + }' +``` diff --git a/docs/source/tutorials/multi_npu.md b/docs/source/tutorials/multi_npu.md new file mode 100644 index 0000000..e59b725 --- /dev/null +++ b/docs/source/tutorials/multi_npu.md @@ -0,0 +1,107 @@ +# Multi-NPU (QwQ 32B) + +## Run vllm-ascend on Multi-NPU + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci1 \ +--device /dev/davinci2 \ +--device /dev/davinci3 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +### Online Inference on Multi-NPU + +Run the following script to start the vLLM server on Multi-NPU: + +```bash +vllm serve Qwen/QwQ-32B --max-model-len 4096 --port 8000 -tp 4 +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/QwQ-32B", + "prompt": "QwQ-32B是什么?", + "max_tokens": "128", + "top_p": "0.95", + "top_k": "40", + 
"temperature": "0.6" + }' +``` + +### Offline Inference on Multi-NPU + +Run the following script to execute offline inference on multi-NPU: + +```python +import gc + +import torch + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) +llm = LLM(model="Qwen/QwQ-32B", + tensor_parallel_size=4, + distributed_executor_backend="mp", + max_model_len=4096) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +del llm +clean_up() +``` + +If you run this script successfully, you can see the info shown below: + +```bash +Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I' +Prompt: 'The future of AI is', Generated text: ' following you. 
As the technology advances, a new report from the Institute for the' +``` diff --git a/docs/source/tutorials/multi_npu_moge.md b/docs/source/tutorials/multi_npu_moge.md new file mode 100644 index 0000000..7fc85e1 --- /dev/null +++ b/docs/source/tutorials/multi_npu_moge.md @@ -0,0 +1,242 @@ +# Multi-NPU (Pangu Pro MoE) + +## Run vllm-ascend on Multi-NPU + +Run container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci1 \ +--device /dev/davinci2 \ +--device /dev/davinci3 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +Download the model: + +```bash +git lfs install +git clone https://gitcode.com/ascend-tribe/pangu-pro-moe-model.git +``` + +### Online Inference on Multi-NPU + +Run the following script to start the vLLM server on Multi-NPU: + +```bash +vllm serve /path/to/pangu-pro-moe-model \ +--tensor-parallel-size 4 \ +--enable-expert-parallel \ +--trust-remote-code \ +--enforce-eager +``` + +Once your server is started, you can query the model with input prompts: + +:::::{tab-set} +::::{tab-item} v1/completions + +```{code-block} bash + :substitutions: +export question="你是谁?" 
+curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "[unused9]系统:[unused10][unused9]用户:'${question}'[unused10][unused9]助手:", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + }' +``` + +:::: + +::::{tab-item} v1/chat/completions + +```{code-block} bash + :substitutions: +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "system", "content": ""}, + {"role": "user", "content": "你是谁?"} + ], + "max_tokens": "64", + "top_p": "0.95", + "top_k": "50", + "temperature": "0.6", + "add_special_tokens" : true + }' +``` + +:::: +::::: + +If you run this successfully, you can see the info shown below: + +```json +{"id":"cmpl-2cd4223228ab4be9a91f65b882e65b32","object":"text_completion","created":1751255067,"model":"/root/.cache/pangu-pro-moe-model","choices":[{"index":0,"text":" [unused16] 好的,用户问我是谁,我需要根据之前的设定来回答。用户提到我是华为开发的“盘古Reasoner”,属于盘古大模型系列,作为智能助手帮助解答问题和提供 信息支持。现在用户再次询问,可能是在确认我的身份或者测试我的回答是否一致。\n\n首先,我要确保","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":15,"total_tokens":79,"completion_tokens":64,"prompt_tokens_details":null},"kv_transfer_params":null} +``` + +### Offline Inference on Multi-NPU + +Run the following script to execute offline inference on multi-NPU: + +:::::{tab-set} +::::{tab-item} Graph Mode + +```{code-block} python + :substitutions: +import gc +from transformers import AutoTokenizer +import torch +import os + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + + +if __name__ == "__main__": + + tokenizer = AutoTokenizer.from_pretrained("/path/to/pangu-pro-moe-model", 
trust_remote_code=True) + tests = [ + "Hello, my name is", + "The future of AI is", + ] + prompts = [] + for text in tests: + messages = [ + {"role": "system", "content": ""}, # Optionally customize system content + {"role": "user", "content": text} + ] + prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + prompts.append(prompt) + + sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + + llm = LLM(model="/path/to/pangu-pro-moe-model", + tensor_parallel_size=4, + enable_expert_parallel=True, + distributed_executor_backend="mp", + max_model_len=1024, + trust_remote_code=True, + additional_config={ + 'torchair_graph_config': { + 'enabled': True, + }, + 'ascend_scheduler_config':{ + 'enabled': True, + 'enable_chunked_prefill' : False, + 'chunked_prefill_enabled': False + }, + }) + + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + del llm + clean_up() +``` + +:::: + +::::{tab-item} Eager Mode + +```{code-block} python + :substitutions: +import gc +from transformers import AutoTokenizer +import torch +import os + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + + +if __name__ == "__main__": + + tokenizer = AutoTokenizer.from_pretrained("/path/to/pangu-pro-moe-model", trust_remote_code=True) + tests = [ + "Hello, my name is", + "The future of AI is", + ] + prompts = [] + for text in tests: + messages = [ + {"role": "system", "content": ""}, # Optionally customize system content + {"role": "user", "content": text} + ] + prompt = tokenizer.apply_chat_template(messages, 
tokenize=False, add_generation_prompt=True) + prompts.append(prompt) + + sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + + llm = LLM(model="/path/to/pangu-pro-moe-model", + tensor_parallel_size=4, + distributed_executor_backend="mp", + max_model_len=1024, + trust_remote_code=True, + enforce_eager=True) + + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + del llm + clean_up() +``` + +:::: +::::: + +If you run this script successfully, you can see the info shown below: + +```bash +Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I' +Prompt: 'The future of AI is', Generated text: ' following you. As the technology advances, a new report from the Institute for the' +``` diff --git a/docs/source/tutorials/multi_npu_quantization.md b/docs/source/tutorials/multi_npu_quantization.md new file mode 100644 index 0000000..63bc489 --- /dev/null +++ b/docs/source/tutorials/multi_npu_quantization.md @@ -0,0 +1,137 @@ +# Multi-NPU (QwQ 32B W8A8) + +## Run docker container +:::{note} +w8a8 quantization feature is supported by v0.8.4rc2 or higher +::: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci1 \ +--device /dev/davinci2 \ +--device /dev/davinci3 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v 
/root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +## Install modelslim and convert model +:::{note} +You can choose to convert the model yourself or use the quantized model we uploaded, +see https://www.modelscope.cn/models/vllm-ascend/QwQ-32B-W8A8 +::: + +```bash +# (Optional) This tag is recommended and has been verified +git clone https://gitee.com/ascend/msit -b modelslim-VLLM-8.1.RC1.b020_001 + +cd msit/msmodelslim +# Install by running this script +bash install.sh +pip install accelerate + +cd example/Qwen +# Original weight path, Replace with your local model path +MODEL_PATH=/home/models/QwQ-32B +# Path to save converted weight, Replace with your local path +SAVE_PATH=/home/models/QwQ-32B-w8a8 + +# In this conversion process, the npu device is not a must, you can also set --device_type cpu to have a conversion +python3 quant_qwen.py --model_path $MODEL_PATH --save_directory $SAVE_PATH --calib_file ../common/boolq.jsonl --w_bit 8 --a_bit 8 --device_type npu --anti_method m1 --trust_remote_code True +``` + +## Verify the quantized model +The converted model files look like: + +```bash +. +|-- config.json +|-- configuration.json +|-- generation_config.json +|-- quant_model_description.json +|-- quant_model_weight_w8a8.safetensors +|-- README.md +|-- tokenizer.json +`-- tokenizer_config.json +``` + +Run the following script to start the vLLM server with quantized model: + +:::{note} +The value "ascend" for "--quantization" argument will be supported after [a specific PR](https://github.com/vllm-project/vllm-ascend/pull/877) is merged and released, you can cherry-pick this commit for now. 
+::: + +```bash +vllm serve /home/models/QwQ-32B-w8a8 --tensor-parallel-size 4 --served-model-name "qwq-32b-w8a8" --max-model-len 4096 --quantization ascend +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwq-32b-w8a8", + "prompt": "what is large language model?", + "max_tokens": "128", + "top_p": "0.95", + "top_k": "40", + "temperature": "0.0" + }' +``` + +Run the following script to execute offline inference on multi-NPU with quantized model: + +:::{note} +To enable quantization for ascend, quantization method must be "ascend" +::: + +```python +import gc + +import torch + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + +llm = LLM(model="/home/models/QwQ-32B-w8a8", + tensor_parallel_size=4, + distributed_executor_backend="mp", + max_model_len=4096, + quantization="ascend") + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +del llm +clean_up() +``` diff --git a/docs/source/tutorials/multi_npu_qwen3_moe.md b/docs/source/tutorials/multi_npu_qwen3_moe.md new file mode 100644 index 0000000..4ac9bca --- /dev/null +++ b/docs/source/tutorials/multi_npu_qwen3_moe.md @@ -0,0 +1,109 @@ +# Multi-NPU (Qwen3-30B-A3B) + +## Run vllm-ascend on Multi-NPU with Qwen3 MoE + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export 
IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci1 \ +--device /dev/davinci2 \ +--device /dev/davinci3 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +### Online Inference on Multi-NPU + +Run the following script to start the vLLM server on Multi-NPU: + +For an Atlas A2 with 64GB of NPU card memory, tensor-parallel-size should be at least 2, and for 32GB of memory, tensor-parallel-size should be at least 4. 
+ +```bash +vllm serve Qwen/Qwen3-30B-A3B --tensor-parallel-size 4 --enable_expert_parallel +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-30B-A3B", + "messages": [ + {"role": "user", "content": "Give me a short introduction to large language models."} + ], + "temperature": 0.6, + "top_p": 0.95, + "top_k": 20, + "max_tokens": 4096 +}' +``` + +### Offline Inference on Multi-NPU + +Run the following script to execute offline inference on multi-NPU: + +```python +import gc +import torch + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) +llm = LLM(model="Qwen/Qwen3-30B-A3B", + tensor_parallel_size=4, + distributed_executor_backend="mp", + max_model_len=4096, + enable_expert_parallel=True) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +del llm +clean_up() +``` + +If you run this script successfully, you can see the info shown below: + +```bash +Prompt: 'Hello, my name is', Generated text: " Lucy. I'm from the UK and I'm 11 years old." 
+Prompt: 'The future of AI is', Generated text: ' a topic that has captured the imagination of scientists, philosophers, and the general public' +``` diff --git a/docs/source/tutorials/single_node_300i.md b/docs/source/tutorials/single_node_300i.md new file mode 100644 index 0000000..270d002 --- /dev/null +++ b/docs/source/tutorials/single_node_300i.md @@ -0,0 +1,406 @@ +# Single Node (Atlas 300I series) + +```{note} +1. This Atlas 300I series is currently experimental. In future versions, there may be behavioral changes around model coverage, performance improvements. +2. Currently, the 310I series only supports eager mode and the data type is float16. +``` + +## Run vLLM on Atlas 300I series + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|-310p +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci1 \ +--device /dev/davinci2 \ +--device /dev/davinci3 \ +--device /dev/davinci4 \ +--device /dev/davinci5 \ +--device /dev/davinci6 \ +--device /dev/davinci7 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +### Online Inference on NPU + +Run the following script to start the vLLM server on NPU (Qwen3-0.6B: 1 card, Qwen2.5-7B-Instruct: 2 cards, Pangu-Pro-MoE-72B: 
8 cards): + +:::::{tab-set} +:sync-group: inference + +::::{tab-item} Qwen3-0.6B +:selected: +:sync: qwen0.6 + +Run the following command to start the vLLM server: + +```{code-block} bash + :substitutions: +vllm serve Qwen/Qwen3-0.6B \ + --tensor-parallel-size 1 \ + --enforce-eager \ + --dtype float16 \ + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + }' +``` + +:::: + +::::{tab-item} Qwen2.5-7B-Instruct +:sync: qwen7b + +Run the following command to start the vLLM server: + +```{code-block} bash + :substitutions: +vllm serve Qwen/Qwen2.5-7B-Instruct \ + --tensor-parallel-size 2 \ + --enforce-eager \ + --dtype float16 \ + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + }' +``` + +:::: + +::::{tab-item} Qwen2.5-VL-3B-Instruct +:sync: qwen-vl-2.5-3b + +Run the following command to start the vLLM server: + +```{code-block} bash + :substitutions: +vllm serve Qwen/Qwen2.5-VL-3B-Instruct \ + --tensor-parallel-size 1 \ + --enforce-eager \ + --dtype float16 \ + --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}' +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "The future of AI is", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + 
}' +``` + +:::: + +::::{tab-item} Pangu-Pro-MoE-72B +:sync: pangu + +Download the model: + +```bash +git lfs install +git clone https://gitcode.com/ascend-tribe/pangu-pro-moe-model.git +``` + +Run the following command to start the vLLM server: + +```{code-block} bash + :substitutions: + +vllm serve /home/pangu-pro-moe-mode/ \ +--tensor-parallel-size 4 \ +--enable-expert-parallel \ +--dtype "float16" \ +--trust-remote-code \ +--enforce-eager + +``` + +Once your server is started, you can query the model with input prompts + +```bash +export question="你是谁?" +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "[unused9]系统:[unused10][unused9]用户:'${question}'[unused10][unused9]助手:", + "max_tokens": 64, + "top_p": 0.95, + "top_k": 50, + "temperature": 0.6 + }' +``` + +:::: +::::: + +If you run this script successfully, you can see the results. + +### Offline Inference + +Run the following script (`example.py`) to execute offline inference on NPU: + +:::::{tab-set} +:sync-group: inference + +::::{tab-item} Qwen3-0.6B +:selected: +:sync: qwen0.6 + +```{code-block} python + :substitutions: +from vllm import LLM, SamplingParams +import gc +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() +prompts = [ + "Hello, my name is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +llm = LLM( + model="Qwen/Qwen3-0.6B", + tensor_parallel_size=1, + enforce_eager=True, # For 300I series, only eager mode is supported. 
+ dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series + compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series +) +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +del llm +clean_up() +``` + +:::: + +::::{tab-item} Qwen2.5-7B-Instruct +:sync: qwen7b + +```{code-block} python + :substitutions: +from vllm import LLM, SamplingParams +import gc +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() +prompts = [ + "Hello, my name is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +llm = LLM( + model="Qwen/Qwen2.5-7B-Instruct", + tensor_parallel_size=2, + enforce_eager=True, # For 300I series, only eager mode is supported. + dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series + compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series +) +# Generate texts from the prompts. 
+outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +del llm +clean_up() +``` + +:::: + +::::{tab-item} Qwen2.5-VL-3B-Instruct +:sync: qwen-vl-2.5-3b + +```{code-block} python + :substitutions: +from vllm import LLM, SamplingParams +import gc +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() +prompts = [ + "Hello, my name is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, top_p=0.95, top_k=50, temperature=0.6) +# Create an LLM. +llm = LLM( + model="Qwen/Qwen2.5-VL-3B-Instruct", + tensor_parallel_size=1, + enforce_eager=True, # For 300I series, only eager mode is supported. + dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series + compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series +) +# Generate texts from the prompts. 
+outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +del llm +clean_up() +``` + +:::: + +::::{tab-item} Pangu-Pro-MoE-72B +:sync: pangu + +Download the model: + +```bash +git lfs install +git clone https://gitcode.com/ascend-tribe/pangu-pro-moe-model.git +``` + +```{code-block} python + :substitutions: + +import gc +from transformers import AutoTokenizer +import torch + +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import (destroy_distributed_environment, + destroy_model_parallel) + +def clean_up(): + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + + +if __name__ == "__main__": + + tokenizer = AutoTokenizer.from_pretrained("/home/pangu-pro-moe-mode/", trust_remote_code=True) + tests = [ + "Hello, my name is", + "The future of AI is", + ] + prompts = [] + for text in tests: + messages = [ + {"role": "system", "content": ""}, # Optionally customize system content + {"role": "user", "content": text} + ] + prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) # 推荐使用官方的template + prompts.append(prompt) + sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40) + + llm = LLM(model="/home/pangu-pro-moe-mode/", + tensor_parallel_size=8, + distributed_executor_backend="mp", + enable_expert_parallel=True, + dtype="float16", + max_model_len=1024, + trust_remote_code=True, + enforce_eager=True) + + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + del llm + clean_up() +``` + +:::: +::::: + +Run script: + +```bash +python example.py +``` + +If you run this script successfully, you can see the info shown below: + +```bash 
+Prompt: 'Hello, my name is', Generated text: " Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the US. I want to know if there are any opportunities in the US for me to work. I'm also interested in the culture and lifestyle in the US. I want to know if there are any opportunities for me to work in the US. I'm also interested in the culture and lifestyle in the US. I'm interested in the culture" +Prompt: 'The future of AI is', Generated text: " not just about the technology itself, but about how we use it to solve real-world problems. As AI continues to evolve, it's important to consider the ethical implications of its use. AI has the potential to bring about significant changes in society, but it also has the power to create new challenges. Therefore, it's crucial to develop a comprehensive approach to AI that takes into account both the benefits and the risks associated with its use. This includes addressing issues such as bias, privacy, and accountability." 
+``` diff --git a/docs/source/tutorials/single_npu.md b/docs/source/tutorials/single_npu.md new file mode 100644 index 0000000..fc8a266 --- /dev/null +++ b/docs/source/tutorials/single_npu.md @@ -0,0 +1,202 @@ +# Single NPU (Qwen3 8B) + +## Run vllm-ascend on Single NPU + +### Offline Inference on Single NPU + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +:::{note} +`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html). 
+::: + +Run the following script to execute offline inference on a single NPU: + +:::::{tab-set} +::::{tab-item} Graph Mode + +```{code-block} python + :substitutions: +import os +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +llm = LLM( + model="Qwen/Qwen3-8B", + max_model_len=26240 +) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +:::: + +::::{tab-item} Eager Mode + +```{code-block} python + :substitutions: +import os +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The future of AI is", +] +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) +llm = LLM( + model="Qwen/Qwen3-8B", + max_model_len=26240, + enforce_eager=True +) + +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +:::: +::::: + +If you run this script successfully, you can see the info shown below: + +```bash +Prompt: 'Hello, my name is', Generated text: ' Daniel and I am an 8th grade student at York Middle School. I' +Prompt: 'The future of AI is', Generated text: ' following you. 
As the technology advances, a new report from the Institute for the' +``` + +### Online Serving on Single NPU + +Run docker container to start the vLLM server on a single NPU: + +:::::{tab-set} +::::{tab-item} Graph Mode + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +vllm serve Qwen/Qwen3-8B --max_model_len 26240 +``` + +:::: + +::::{tab-item} Eager Mode + +```{code-block} bash + :substitutions: +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +vllm serve Qwen/Qwen3-8B --max_model_len 26240 --enforce-eager +``` + +:::: +::::: + +:::{note} +Add `--max_model_len` option to avoid ValueError that the Qwen2.5-7B model's max seq len (32768) is larger than the maximum number of tokens that can be 
stored in KV cache (26240). This will differ with different NPU series based on the HBM size. Please adjust the value to one suitable for your NPU series. +::: + +If your service starts successfully, you can see the info shown below: + +```bash +INFO: Started server process [6873] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +Once your server is started, you can query the model with input prompts: + +```bash +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen3-8B", + "prompt": "The future of AI is", + "max_tokens": 7, + "temperature": 0 + }' +``` + +If you query the server successfully, you can see the info shown below (client): + +```bash +{"id":"cmpl-b25a59a2f985459781ce7098aeddfda7","object":"text_completion","created":1739523925,"model":"Qwen/Qwen3-8B","choices":[{"index":0,"text":" here. It’s not just a","logprobs":null,"finish_reason":"length","stop_reason":null,"prompt_logprobs":null}],"usage":{"prompt_tokens":5,"total_tokens":12,"completion_tokens":7,"prompt_tokens_details":null}} +``` + +Logs of the vllm server: + +```bash +INFO: 172.17.0.1:49518 - "POST /v1/completions HTTP/1.1" 200 OK +INFO 02-13 08:34:35 logger.py:39] Received request cmpl-574f00e342904692a73fb6c1c986c521-0: prompt: 'San Francisco is a', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=7, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None), prompt_token_ids: [23729, 12879, 374, 264], lora_request: None, prompt_adapter_request: None. 
+``` diff --git a/docs/source/tutorials/single_npu_audio.md b/docs/source/tutorials/single_npu_audio.md new file mode 100644 index 0000000..137d761 --- /dev/null +++ b/docs/source/tutorials/single_npu_audio.md @@ -0,0 +1,122 @@ +# Single NPU (Qwen2-Audio 7B) + +## Run vllm-ascend on Single NPU + +### Offline Inference on Single NPU + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +:::{note} +`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html). 
+::: + +Install packages required for audio processing: + +```bash +pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +pip install librosa soundfile +``` + +Run the following script to execute offline inference on a single NPU: + +```python +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.utils import FlexibleArgumentParser + +# If network issues prevent AudioAsset from fetching remote audio files, retry or check your network. +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = { + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} + + +def prepare_inputs(audio_count: int): + audio_in_prompt = "".join([ + f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + for idx in range(audio_count) + ]) + question = question_per_audio_count[audio_count] + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + + mm_data = { + "audio": + [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] + } + + # Merge text prompt and audio data into inputs + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + return inputs + + +def main(audio_count: int): + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on + # lower-end GPUs. + # Unless specified, these settings have been tested to work on a single L4. + # `limit_mm_per_prompt`: the max num items for each modality per prompt. 
+ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + inputs = prepare_inputs(audio_count) + + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=None) + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + audio_count = 2 + main(audio_count) +``` + +If you run this script successfully, you can see the info shown below: + +```bash +The sport referenced is baseball, and the nursery rhyme is 'Mary Had a Little Lamb'. +``` + +### Online Serving on Single NPU + +Currently, vllm's OpenAI-compatible server doesn't support audio inputs, find more details [here](https://github.com/vllm-project/vllm/issues/19977). diff --git a/docs/source/tutorials/single_npu_multimodal.md b/docs/source/tutorials/single_npu_multimodal.md new file mode 100644 index 0000000..a678ec7 --- /dev/null +++ b/docs/source/tutorials/single_npu_multimodal.md @@ -0,0 +1,192 @@ +# Single NPU (Qwen2.5-VL 7B) + +## Run vllm-ascend on Single NPU + +### Offline Inference on Single NPU + +Run docker container: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export 
VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +:::{note} +`max_split_size_mb` prevents the native allocator from splitting blocks larger than this size (in MB). This can reduce fragmentation and may allow some borderline workloads to complete without running out of memory. You can find more details [here](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/800alpha003/apiref/envref/envref_07_0061.html). +::: + +Run the following script to execute offline inference on a single NPU: + +```bash +pip install qwen_vl_utils --extra-index-url https://download.pytorch.org/whl/cpu/ +``` + +```python +from transformers import AutoProcessor +from vllm import LLM, SamplingParams +from qwen_vl_utils import process_vision_info + +MODEL_PATH = "Qwen/Qwen2.5-VL-7B-Instruct" + +llm = LLM( + model=MODEL_PATH, + max_model_len=16384, + limit_mm_per_prompt={"image": 10}, +) + +sampling_params = SamplingParams( + max_tokens=512 +) + +image_messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png", + "min_pixels": 224 * 224, + "max_pixels": 1280 * 28 * 28, + }, + {"type": "text", "text": "Please provide a detailed description of this image"}, + ], + }, +] + +messages = image_messages + +processor = AutoProcessor.from_pretrained(MODEL_PATH) +prompt = processor.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, +) + +image_inputs, _, _ = process_vision_info(messages, return_video_kwargs=True) + +mm_data = {} +if image_inputs is not None: + mm_data["image"] = image_inputs + +llm_inputs = { + "prompt": prompt, + "multi_modal_data": mm_data, +} + +outputs = llm.generate([llm_inputs], sampling_params=sampling_params) +generated_text = outputs[0].outputs[0].text + 
+print(generated_text) +``` + +If you run this script successfully, you can see the info shown below: + +```bash +The image displays a logo consisting of two main elements: a stylized geometric design and a pair of text elements. + +1. **Geometric Design**: On the left side of the image, there is a blue geometric design that appears to be made up of interconnected shapes. These shapes resemble a network or a complex polygonal structure, possibly hinting at a technological or interconnected theme. The design is monochromatic and uses only blue as its color, which could be indicative of a specific brand or company. + +2. **Text Elements**: To the right of the geometric design, there are two lines of text. The first line reads "TONGYI" in a sans-serif font, with the "YI" part possibly being capitalized. The second line reads "Qwen" in a similar sans-serif font, but in a smaller size. + +The overall design is modern and minimalist, with a clear contrast between the geometric and textual elements. The use of blue for the geometric design could suggest themes of technology, connectivity, or innovation, which are common associations with the color blue in branding. The simplicity of the design makes it easily recognizable and memorable. 
+``` + +### Online Serving on Single NPU + +Run docker container to start the vLLM server on a single NPU: + +```{code-block} bash + :substitutions: + +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-e VLLM_USE_MODELSCOPE=True \ +-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \ +-it $IMAGE \ +vllm serve Qwen/Qwen2.5-VL-7B-Instruct \ +--dtype bfloat16 \ +--max_model_len 16384 \ +--max-num-batched-tokens 16384 +``` + +:::{note} +Add `--max_model_len` option to avoid ValueError that the Qwen2.5-VL-7B-Instruct model's max seq len (128000) is larger than the maximum number of tokens that can be stored in KV cache. This will differ with different NPU series based on the HBM size. Please modify the value to one suitable for your NPU series. +::: + +If your service starts successfully, you can see the info shown below: + +```bash +INFO: Started server process [2736] +INFO: Waiting for application startup. +INFO: Application startup complete. 
+``` + +Once your server is started, you can query the model with input prompts: + +```bash +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": [ + {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}}, + {"type": "text", "text": "What is the text in the illustrate?"} + ]} + ] + }' +``` + +If you query the server successfully, you can see the info shown below (client): + +```bash +{"id":"chatcmpl-f04fb20e79bb40b39b8ed7fdf5bd613a","object":"chat.completion","created":1741749149,"model":"Qwen/Qwen2.5-VL-7B-Instruct","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the illustration reads \"TONGYI Qwen.\"","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":74,"total_tokens":89,"completion_tokens":15,"prompt_tokens_details":null},"prompt_logprobs":null} +``` + +Logs of the vllm server: + +```bash +INFO 03-12 11:16:50 logger.py:39] Received request chatcmpl-92148a41eca64b6d82d3d7cfa5723aeb: prompt: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>\nWhat is the text in the illustrate?<|im_end|>\n<|im_start|>assistant\n', params: SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16353, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None), prompt_token_ids: None, lora_request: None, prompt_adapter_request: None. 
+INFO 03-12 11:16:50 engine.py:280] Added request chatcmpl-92148a41eca64b6d82d3d7cfa5723aeb. +INFO: 127.0.0.1:54004 - "POST /v1/chat/completions HTTP/1.1" 200 OK +``` diff --git a/docs/source/tutorials/single_npu_qwen3_embedding.md b/docs/source/tutorials/single_npu_qwen3_embedding.md new file mode 100644 index 0000000..b432509 --- /dev/null +++ b/docs/source/tutorials/single_npu_qwen3_embedding.md @@ -0,0 +1,99 @@ +# Single NPU (Qwen3-Embedding-8B) + +The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This guide describes how to run the model with vLLM Ascend. Note that only 0.9.2rc1 and higher versions of vLLM Ascend support the model. + +## Run docker container + +Take Qwen3-Embedding-8B model as an example, first run the docker container with the following command: + +```{code-block} bash + :substitutions: +# Update the vllm-ascend image +export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version| +docker run --rm \ +--name vllm-ascend \ +--device /dev/davinci0 \ +--device /dev/davinci_manager \ +--device /dev/devmm_svm \ +--device /dev/hisi_hdc \ +-v /usr/local/dcmi:/usr/local/dcmi \ +-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ +-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ +-v /etc/ascend_install.info:/etc/ascend_install.info \ +-v /root/.cache:/root/.cache \ +-p 8000:8000 \ +-it $IMAGE bash +``` + +Setup environment variables: + +```bash +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=True + +# Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 +``` + +### Online 
Inference + +```bash +vllm serve Qwen/Qwen3-Embedding-8B --task embed +``` + +Once your server is started, you can query the model with input prompts + +```bash +curl http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{ + "model": "Qwen/Qwen3-Embedding-8B", + "messages": [ + {"role": "user", "content": "Hello"} + ] +}' +``` + +### Offline Inference + +```python +import torch +import vllm +from vllm import LLM + +def get_detailed_instruct(task_description: str, query: str) -> str: + return f'Instruct: {task_description}\nQuery:{query}' + + +if __name__=="__main__": + # Each query must come with a one-sentence instruction that describes the task + task = 'Given a web search query, retrieve relevant passages that answer the query' + + queries = [ + get_detailed_instruct(task, 'What is the capital of China?'), + get_detailed_instruct(task, 'Explain gravity') + ] + # No need to add instruction for retrieval documents + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun." + ] + input_texts = queries + documents + + model = LLM(model="Qwen/Qwen3-Embedding-8B", + task="embed", + distributed_executor_backend="mp") + + outputs = model.embed(input_texts) + embeddings = torch.tensor([o.outputs.embedding for o in outputs]) + scores = (embeddings[:2] @ embeddings[2:].T) + print(scores.tolist()) +``` + +If you run this script successfully, you can see the info shown below: + +```bash +Adding requests: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 282.22it/s] +Processed prompts: 0%| | 0/4 [00:00 4 now. 
[#1508](https://github.com/vllm-project/vllm-ascend/issues/1508) +- MTP support torchair graph mode now [#2145](https://github.com/vllm-project/vllm-ascend/pull/2145) + +### Other + +- Bug fixes: + * Fix functional problem of multi-modality models like Qwen2-audio with Aclgraph. [#1803](https://github.com/vllm-project/vllm-ascend/pull/1803) + * Fix the process group creating error with external launch scenario. [#1681](https://github.com/vllm-project/vllm-ascend/pull/1681) + * Fix the functional problem with guided decoding. [#2022](https://github.com/vllm-project/vllm-ascend/pull/2022) + * Fix the accuracy issue with common MoE models in DP scenario. [#1856](https://github.com/vllm-project/vllm-ascend/pull/1856) +- Performance improved through a lot of prs: + * Caching sin/cos instead of calculate it every layer. [#1890](https://github.com/vllm-project/vllm-ascend/pull/1890) + * Improve shared expert multi-stream parallelism [#1891](https://github.com/vllm-project/vllm-ascend/pull/1891) + * Implement the fusion of allreduce and matmul in prefill phase when tp is enabled. Enable this feature by setting `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE` to `1`. [#1926](https://github.com/vllm-project/vllm-ascend/pull/1926) + * Optimize Quantized MoE Performance by Reducing All2All Communication. [#2195](https://github.com/vllm-project/vllm-ascend/pull/2195) + * Use AddRmsNormQuant ops in the custom model to optimize Qwen3's performance [#1806](https://github.com/vllm-project/vllm-ascend/pull/1806) + * Use multicast to avoid padding decode request to prefill size [#1555](https://github.com/vllm-project/vllm-ascend/pull/1555) + * The performance of LoRA has been improved. [#1884](https://github.com/vllm-project/vllm-ascend/pull/1884) +- A batch of refactoring prs to enhance the code architecture: + * Torchair model runner refactor [#2205](https://github.com/vllm-project/vllm-ascend/pull/2205) + * Refactoring forward_context and model_runner_v1. 
[#1979](https://github.com/vllm-project/vllm-ascend/pull/1979) + * Refactor AscendMetaData Comments. [#1967](https://github.com/vllm-project/vllm-ascend/pull/1967) + * Refactor torchair utils. [#1892](https://github.com/vllm-project/vllm-ascend/pull/1892) + * Refactor torchair worker. [#1885](https://github.com/vllm-project/vllm-ascend/pull/1885) + * Register activation customop instead of overwrite forward_oot. [#1841](https://github.com/vllm-project/vllm-ascend/pull/1841) +- Parameters changes: + * `expert_tensor_parallel_size` in `additional_config` is removed now, and the EP and TP is aligned with vLLM now. [#1681](https://github.com/vllm-project/vllm-ascend/pull/1681) + * Add `VLLM_ASCEND_MLA_PA` in environ variables, use this to enable mla paged attention operator for deepseek mla decode. + * Add `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE` in environ variables, enable `MatmulAllReduce` fusion kernel when tensor parallel is enabled. This feature is supported in A2, and eager mode will get better performance. + * Add `VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ` in environ variables, Whether to enable moe all2all seq, this provides a basic framework on the basis of alltoall for easy expansion. + +- UT coverage reached 76.34% after a batch of prs followed by this rfc: [#1298](https://github.com/vllm-project/vllm-ascend/issues/1298) +- Sequence Parallelism works for Qwen3 MoE. [#2209](https://github.com/vllm-project/vllm-ascend/issues/2209) +- Chinese online document is added now. [#1870](https://github.com/vllm-project/vllm-ascend/issues/1870) + +### Known Issues +- Aclgraph could not work with DP + EP currently, the mainly gap is the number of npu stream that Aclgraph needed to capture graph is not enough. [#2229](https://github.com/vllm-project/vllm-ascend/issues/2229) +- There is an accuracy issue on W8A8 dynamic quantized DeepSeek with multistream enabled. This will be fixed in the next release. 
[#2232](https://github.com/vllm-project/vllm-ascend/issues/2232) +- In Qwen3 MoE, SP cannot be incorporated into the Aclgraph. [#2246](https://github.com/vllm-project/vllm-ascend/issues/2246) +- MTP does not support V1 scheduler currently; this will be fixed in Q3. [#2254](https://github.com/vllm-project/vllm-ascend/issues/2254) +- When running MTP with DP > 1, we need to disable metrics logger due to some issue on vLLM. [#2254](https://github.com/vllm-project/vllm-ascend/issues/2254) + +## v0.9.1rc2 - 2025.08.04 +This is the 2nd release candidate of v0.9.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/) to get started. + +### Highlights +- MOE and dense w4a8 quantization support now: [#1320](https://github.com/vllm-project/vllm-ascend/pull/1320) [#1910](https://github.com/vllm-project/vllm-ascend/pull/1910) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) [#1480](https://github.com/vllm-project/vllm-ascend/pull/1480) +- Dynamic EPLB support in [#1943](https://github.com/vllm-project/vllm-ascend/pull/1943) +- Disaggregated Prefilling support for V1 Engine and improvement, continued development and stabilization of the disaggregated prefill feature, including performance enhancements and bug fixes for single-machine setups: [#1953](https://github.com/vllm-project/vllm-ascend/pull/1953) [#1612](https://github.com/vllm-project/vllm-ascend/pull/1612) [#1361](https://github.com/vllm-project/vllm-ascend/pull/1361) [#1746](https://github.com/vllm-project/vllm-ascend/pull/1746) [#1552](https://github.com/vllm-project/vllm-ascend/pull/1552) [#1801](https://github.com/vllm-project/vllm-ascend/pull/1801) [#2083](https://github.com/vllm-project/vllm-ascend/pull/2083) [#1989](https://github.com/vllm-project/vllm-ascend/pull/1989) + +### Models improvement: +- DeepSeek DBO support and improvement: [#1285](https://github.com/vllm-project/vllm-ascend/pull/1285) 
[#1291](https://github.com/vllm-project/vllm-ascend/pull/1291) [#1328](https://github.com/vllm-project/vllm-ascend/pull/1328) [#1420](https://github.com/vllm-project/vllm-ascend/pull/1420) [#1445](https://github.com/vllm-project/vllm-ascend/pull/1445) [#1589](https://github.com/vllm-project/vllm-ascend/pull/1589) [#1759](https://github.com/vllm-project/vllm-ascend/pull/1759) [#1827](https://github.com/vllm-project/vllm-ascend/pull/1827) [#2093](https://github.com/vllm-project/vllm-ascend/pull/2093) +- DeepSeek MTP improvement and bugfix: [#1214](https://github.com/vllm-project/vllm-ascend/pull/1214) [#943](https://github.com/vllm-project/vllm-ascend/pull/943) [#1584](https://github.com/vllm-project/vllm-ascend/pull/1584) [#1473](https://github.com/vllm-project/vllm-ascend/pull/1473) [#1294](https://github.com/vllm-project/vllm-ascend/pull/1294) [#1632](https://github.com/vllm-project/vllm-ascend/pull/1632) [#1694](https://github.com/vllm-project/vllm-ascend/pull/1694) [#1840](https://github.com/vllm-project/vllm-ascend/pull/1840) [#2076](https://github.com/vllm-project/vllm-ascend/pull/2076) [#1990](https://github.com/vllm-project/vllm-ascend/pull/1990) [#2019](https://github.com/vllm-project/vllm-ascend/pull/2019) +- Qwen3 MoE support improvement and bugfix around graph mode and DP: [#1940](https://github.com/vllm-project/vllm-ascend/pull/1940) [#2006](https://github.com/vllm-project/vllm-ascend/pull/2006) [#1832](https://github.com/vllm-project/vllm-ascend/pull/1832) +- Qwen3 performance improvement around rmsnorm/repo/mlp ops: [#1545](https://github.com/vllm-project/vllm-ascend/pull/1545) [#1719](https://github.com/vllm-project/vllm-ascend/pull/1719) [#1726](https://github.com/vllm-project/vllm-ascend/pull/1726) [#1782](https://github.com/vllm-project/vllm-ascend/pull/1782) [#1745](https://github.com/vllm-project/vllm-ascend/pull/1745) +- DeepSeek MLA chunked prefill/graph mode/multistream improvement and bugfix: 
[#1240](https://github.com/vllm-project/vllm-ascend/pull/1240) [#933](https://github.com/vllm-project/vllm-ascend/pull/933) [#1135](https://github.com/vllm-project/vllm-ascend/pull/1135) [#1311](https://github.com/vllm-project/vllm-ascend/pull/1311) [#1750](https://github.com/vllm-project/vllm-ascend/pull/1750) [#1872](https://github.com/vllm-project/vllm-ascend/pull/1872) [#2170](https://github.com/vllm-project/vllm-ascend/pull/2170) [#1551](https://github.com/vllm-project/vllm-ascend/pull/1551) +- Qwen2.5 VL improvement via mrope/padding mechanism improvement: [#1261](https://github.com/vllm-project/vllm-ascend/pull/1261) [#1705](https://github.com/vllm-project/vllm-ascend/pull/1705) [#1929](https://github.com/vllm-project/vllm-ascend/pull/1929) [#2007](https://github.com/vllm-project/vllm-ascend/pull/2007) +- Ray: Fix the device error when using ray and add initialize_cache and improve warning info: [#1234](https://github.com/vllm-project/vllm-ascend/pull/1234) [#1501](https://github.com/vllm-project/vllm-ascend/pull/1501) + +### Graph mode improvement: +- Fix DeepSeek with deepseek with mc2 in [#1269](https://github.com/vllm-project/vllm-ascend/pull/1269) +- Fix accuracy problem for deepseek V3/R1 models with torchair graph in long sequence predictions in [#1332](https://github.com/vllm-project/vllm-ascend/pull/1332) +- Fix torchair_graph_batch_sizes bug in [#1570](https://github.com/vllm-project/vllm-ascend/pull/1570) +- Enable the limit of tp <= 4 for torchair graph mode in [#1404](https://github.com/vllm-project/vllm-ascend/pull/1404) +- Fix rope accruracy bug [#1887](https://github.com/vllm-project/vllm-ascend/pull/1887) +- Support multistream of shared experts in FusedMoE [#997](https://github.com/vllm-project/vllm-ascend/pull/997) +- Enable kvcache_nz for the decode process in torchair graph mode[#1098](https://github.com/vllm-project/vllm-ascend/pull/1098) +- Fix chunked-prefill with torchair case to resolve UnboundLocalError: local variable 
'decode_hs_or_q_c' issue in [#1378](https://github.com/vllm-project/vllm-ascend/pull/1378) +- Improve shared experts multi-stream perf for w8a8 dynamic. in [#1561](https://github.com/vllm-project/vllm-ascend/pull/1561) +- Repair moe error when set multistream. in [#1882](https://github.com/vllm-project/vllm-ascend/pull/1882) +- Round up graph batch size to tp size in EP case [#1610](https://github.com/vllm-project/vllm-ascend/pull/1610) +- Fix torchair bug when DP is enabled in [#1727](https://github.com/vllm-project/vllm-ascend/pull/1727) +- Add extra checking to torchair_graph_config. in [#1675](https://github.com/vllm-project/vllm-ascend/pull/1675) +- Fix rope bug in torchair+chunk-prefill scenario in [#1693](https://github.com/vllm-project/vllm-ascend/pull/1693) +- torchair_graph bugfix when chunked_prefill is true in [#1748](https://github.com/vllm-project/vllm-ascend/pull/1748) +- Improve prefill optimization to support torchair graph mode in [#2090](https://github.com/vllm-project/vllm-ascend/pull/2090) +- Fix rank set in DP scenario [#1247](https://github.com/vllm-project/vllm-ascend/pull/1247) +- Reset all unused positions to prevent out-of-bounds to resolve GatherV3 bug in [#1397](https://github.com/vllm-project/vllm-ascend/pull/1397) +- Remove duplicate multimodal codes in ModelRunner in [#1393](https://github.com/vllm-project/vllm-ascend/pull/1393) +- Fix block table shape to resolve accuracy issue in [#1297](https://github.com/vllm-project/vllm-ascend/pull/1297) +- Implement primal full graph with limited scenario in [#1503](https://github.com/vllm-project/vllm-ascend/pull/1503) +- Restore paged attention kernel in Full Graph for performance in [#1677](https://github.com/vllm-project/vllm-ascend/pull/1677) +- Fix DeepSeek OOM issue in extreme `--gpu-memory-utilization` scenario in [#1829](https://github.com/vllm-project/vllm-ascend/pull/1829) +- Turn off aclgraph when enabling TorchAir in [#2154](https://github.com/vllm-project/vllm-ascend/pull/2154) + 
+### Ops improvement: +- add custom ascendc kernel vocabparallelembedding [#796](https://github.com/vllm-project/vllm-ascend/pull/796) +- fix rope sin/cos cache bug in [#1267](https://github.com/vllm-project/vllm-ascend/pull/1267) +- Refactoring AscendFusedMoE (#1229) in [#1264](https://github.com/vllm-project/vllm-ascend/pull/1264) +- Use fused ops npu_top_k_top_p in sampler [#1920](https://github.com/vllm-project/vllm-ascend/pull/1920) + +### Core: +- Upgrade CANN to 8.2.rc1 in [#2036](https://github.com/vllm-project/vllm-ascend/pull/2036) +- Upgrade torch-npu to 2.5.1.post1 in [#2135](https://github.com/vllm-project/vllm-ascend/pull/2135) +- Upgrade python to 3.11 in [#2136](https://github.com/vllm-project/vllm-ascend/pull/2136) +- Disable quantization in mindie_turbo in [#1749](https://github.com/vllm-project/vllm-ascend/pull/1749) +- fix v0 spec decode in [#1323](https://github.com/vllm-project/vllm-ascend/pull/1323) +- Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode in [#1271](https://github.com/vllm-project/vllm-ascend/pull/1271) +- Refactoring forward_context and model_runner_v1 in [#1422](https://github.com/vllm-project/vllm-ascend/pull/1422) +- Fix sampling params in [#1423](https://github.com/vllm-project/vllm-ascend/pull/1423) +- add a switch for enabling NZ layout in weights and enable NZ for GMM. in [#1409](https://github.com/vllm-project/vllm-ascend/pull/1409) +- Resolved bug in ascend_forward_context in [#1449](https://github.com/vllm-project/vllm-ascend/pull/1449) [#1554](https://github.com/vllm-project/vllm-ascend/pull/1554) [#1598](https://github.com/vllm-project/vllm-ascend/pull/1598) +- Address PrefillCacheHit state to fix prefix cache accuracy bug in [#1492](https://github.com/vllm-project/vllm-ascend/pull/1492) +- Fix load weight error and add new e2e case in [#1651](https://github.com/vllm-project/vllm-ascend/pull/1651) +- Optimize the number of rope-related index selections in deepseek. 
in [#1614](https://github.com/vllm-project/vllm-ascend/pull/1614) +- add mc2 mask in [#1642](https://github.com/vllm-project/vllm-ascend/pull/1642) +- Fix static EPLB log2phy condition and improve unit test in [#1667](https://github.com/vllm-project/vllm-ascend/pull/1667) [#1896](https://github.com/vllm-project/vllm-ascend/pull/1896) [#2003](https://github.com/vllm-project/vllm-ascend/pull/2003) +- add chunk mc2 for prefill in [#1703](https://github.com/vllm-project/vllm-ascend/pull/1703) +- Fix mc2 op GroupCoordinator bug in [#1711](https://github.com/vllm-project/vllm-ascend/pull/1711) +- Fix the failure to recognize the actual type of quantization in [#1721](https://github.com/vllm-project/vllm-ascend/pull/1721) +- Fix deepseek bug when tp_size == 1 in [#1755](https://github.com/vllm-project/vllm-ascend/pull/1755) +- Added support for delay-free blocks in prefill nodes in [#1691](https://github.com/vllm-project/vllm-ascend/pull/1691) +- Moe alltoallv communication optimization for unquantized RL training & alltoallv support dpo in [#1547](https://github.com/vllm-project/vllm-ascend/pull/1547) +- Adapt dispatchV2 interface in [#1822](https://github.com/vllm-project/vllm-ascend/pull/1822) +- Fix disaggregate prefill hang issue in long output in [#1807](https://github.com/vllm-project/vllm-ascend/pull/1807) +- Fix flashcomm_v1 when engine v0 in [#1859](https://github.com/vllm-project/vllm-ascend/pull/1859) +- ep_group is not equal to word_size in some cases. 
in [#1862](https://github.com/vllm-project/vllm-ascend/pull/1862) +- Fix wheel glibc version incompatibility in [#1808](https://github.com/vllm-project/vllm-ascend/pull/1808) +- Fix mc2 process group to resolve self.cpu_group is None in [#1831](https://github.com/vllm-project/vllm-ascend/pull/1831) +- Pin vllm version to v0.9.1 to make mypy check passed in [#1904](https://github.com/vllm-project/vllm-ascend/pull/1904) +- Apply npu_moe_gating_top_k_softmax for moe to improve perf in [#1902](https://github.com/vllm-project/vllm-ascend/pull/1902) +- Fix bug in path_decorator when engine v0 in [#1919](https://github.com/vllm-project/vllm-ascend/pull/1919) +- Avoid performing cpu all_reduce in disaggregated-prefill scenario. in [#1644](https://github.com/vllm-project/vllm-ascend/pull/1644) +- add super kernel in decode moe in [#1916](https://github.com/vllm-project/vllm-ascend/pull/1916) +- [Prefill Perf] Parallel Strategy Optimizations (VRAM-for-Speed Tradeoff) in [#1802](https://github.com/vllm-project/vllm-ascend/pull/1802) +- Remove unnecessary reduce_results access in shared_experts.down_proj in [#2016](https://github.com/vllm-project/vllm-ascend/pull/2016) +- Optimize greedy reject sampler with vectorization. 
in [#2002](https://github.com/vllm-project/vllm-ascend/pull/2002) +- Make multiple Ps and Ds work on a single machine in [#1936](https://github.com/vllm-project/vllm-ascend/pull/1936) +- Fixes the shape conflicts between shared & routed experts for deepseek model when tp > 1 and multistream_moe enabled in [#2075](https://github.com/vllm-project/vllm-ascend/pull/2075) +- Add cpu binding support [#2031](https://github.com/vllm-project/vllm-ascend/pull/2031) +- Add with_prefill cpu allreduce to handle D-node recomputation in [#2129](https://github.com/vllm-project/vllm-ascend/pull/2129) +- Add D2H & initRoutingQuantV2 to improve prefill perf in [#2038](https://github.com/vllm-project/vllm-ascend/pull/2038) + +### Docs: +- Provide an e2e guide for execute duration profiling [#1113](https://github.com/vllm-project/vllm-ascend/pull/1113) +- Add Referer header for CANN package download url. [#1192](https://github.com/vllm-project/vllm-ascend/pull/1192) +- Add reinstall instructions doc [#1370](https://github.com/vllm-project/vllm-ascend/pull/1370) +- Update Disaggregate prefill README [#1379](https://github.com/vllm-project/vllm-ascend/pull/1379) +- Disaggregate prefill for kv cache register style [#1296](https://github.com/vllm-project/vllm-ascend/pull/1296) +- Fix errors and non-standard parts in examples/disaggregate_prefill_v1/README.md in [#1965](https://github.com/vllm-project/vllm-ascend/pull/1965) + +### Known Issues +- Full graph mode support is not yet available for specific hardware types with full_cuda_graphenable. [#2182](https://github.com/vllm-project/vllm-ascend/issues/2182) +- Qwen3 MoE aclgraph mode with tp failed when enable ep due to bincount error [#2226](https://github.com/vllm-project/vllm-ascend/issues/2226) +- As mentioned in v0.9.1rc1 release note, Atlas 300I series support will NOT be included. + +## v0.9.2rc1 - 2025.07.11 + +This is the 1st release candidate of v0.9.2 for vLLM Ascend. 
Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started. From this release, V1 engine will be enabled by default, there is no need to set `VLLM_USE_V1=1` any more. And this release is the last version to support V0 engine, V0 code will be clean up in the future. + +### Highlights +- Pooling model works with V1 engine now. You can take a try with Qwen3 embedding model [#1359](https://github.com/vllm-project/vllm-ascend/pull/1359). +- The performance on Atlas 300I series has been improved. [#1591](https://github.com/vllm-project/vllm-ascend/pull/1591) +- aclgraph mode works with Moe models now. Currently, only Qwen3 Moe is well tested. [#1381](https://github.com/vllm-project/vllm-ascend/pull/1381) + +### Core +- Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250619`. Don’t forget to update it in your environment. [#1347](https://github.com/vllm-project/vllm-ascend/pull/1347) +- The GatherV3 error has been fixed with aclgraph mode. [#1416](https://github.com/vllm-project/vllm-ascend/pull/1416) +- W8A8 quantization works on Atlas 300I series now. [#1560](https://github.com/vllm-project/vllm-ascend/pull/1560) +- Fix the accuracy problem with deploy models with parallel parameters. [#1678](https://github.com/vllm-project/vllm-ascend/pull/1678) +- The pre-built wheel package now requires lower version of glibc. Users can use it by `pip install vllm-ascend` directly. [#1582](https://github.com/vllm-project/vllm-ascend/pull/1582) + +### Other +- Official doc has been updated for better read experience. For example, more deployment tutorials are added, user/developer docs are updated. More guide will coming soon. +- Fix accuracy problem for deepseek V3/R1 models with torchair graph in long sequence predictions. [#1331](https://github.com/vllm-project/vllm-ascend/pull/1331) +- A new env variable `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` has been added. 
It enables the fused allgather-experts kernel for Deepseek V3/R1 models. The default value is `0`. [#1335](https://github.com/vllm-project/vllm-ascend/pull/1335)
+- A new env variable `VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION` has been added to improve the performance of topk-topp sampling. The default value is 0, we'll consider enabling it by default in the future. [#1732](https://github.com/vllm-project/vllm-ascend/pull/1732)
+- A batch of bugs have been fixed for the Data Parallelism case [#1273](https://github.com/vllm-project/vllm-ascend/pull/1273) [#1322](https://github.com/vllm-project/vllm-ascend/pull/1322) [#1275](https://github.com/vllm-project/vllm-ascend/pull/1275) [#1478](https://github.com/vllm-project/vllm-ascend/pull/1478)
+- The DeepSeek performance has been improved. [#1194](https://github.com/vllm-project/vllm-ascend/pull/1194) [#1395](https://github.com/vllm-project/vllm-ascend/pull/1395) [#1380](https://github.com/vllm-project/vllm-ascend/pull/1380)
+- Ascend scheduler works with prefix cache now. [#1446](https://github.com/vllm-project/vllm-ascend/pull/1446)
+- DeepSeek works with prefix cache now.
[#1498](https://github.com/vllm-project/vllm-ascend/pull/1498) +- Support prompt logprobs to recover ceval accuracy in V1 [#1483](https://github.com/vllm-project/vllm-ascend/pull/1483) + +### Known Issues + +- Pipeline parallel does not work with ray and graph mode: https://github.com/vllm-project/vllm-ascend/issues/1751 https://github.com/vllm-project/vllm-ascend/issues/1754 + +### New Contributors +- @xleoken made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1357 +- @lyj-jjj made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1335 +- @sharonyunyun made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1194 +- @Pr0Wh1teGivee made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1308 +- @leo-pony made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1374 +- @zeshengzong made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1452 +- @GDzhu01 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1477 +- @Agonixiaoxiao made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1531 +- @zhanghw0354 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1476 +- @farawayboat made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1591 +- @ZhengWG made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1196 +- @wm901115nwpu made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1654 + +**Full Changelog**: https://github.com/vllm-project/vllm-ascend/compare/v0.9.1rc1...v0.9.2rc1 + +## v0.9.1rc1 - 2025.06.22 + +This is the 1st release candidate of v0.9.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started. 
+ +### Experimental + +- Atlas 300I series is experimental supported in this release (Functional test passed with Qwen2.5-7b-instruct/Qwen2.5-0.5b/Qwen3-0.6B/Qwen3-4B/Qwen3-8B). [#1333](https://github.com/vllm-project/vllm-ascend/pull/1333) +- Support EAGLE-3 for speculative decoding. [#1032](https://github.com/vllm-project/vllm-ascend/pull/1032) + +After careful consideration, above features **will NOT be included in v0.9.1-dev branch (v0.9.1 final release)** taking into account the v0.9.1 release quality and the feature rapid iteration. We will improve this from 0.9.2rc1 and later. + +### Core +- Ascend PyTorch adapter (torch_npu) has been upgraded to `2.5.1.post1.dev20250528`. Don’t forget to update it in your environment. [#1235](https://github.com/vllm-project/vllm-ascend/pull/1235) +- Support Atlas 300I series container image. You can get it from [quay.io](https://quay.io/repository/vllm/vllm-ascend) +- Fix token-wise padding mechanism to make multi-card graph mode work. [#1300](https://github.com/vllm-project/vllm-ascend/pull/1300) +- Upgrade vLLM to 0.9.1 [#1165](https://github.com/vllm-project/vllm-ascend/pull/1165) + +### Other Improvements +- Initial support Chunked Prefill for MLA. [#1172](https://github.com/vllm-project/vllm-ascend/pull/1172) +- An example of best practices to run DeepSeek with ETP has been added. [#1101](https://github.com/vllm-project/vllm-ascend/pull/1101) +- Performance improvements for DeepSeek using the TorchAir graph. [#1098](https://github.com/vllm-project/vllm-ascend/pull/1098), [#1131](https://github.com/vllm-project/vllm-ascend/pull/1131) +- Supports the speculative decoding feature with AscendScheduler. [#943](https://github.com/vllm-project/vllm-ascend/pull/943) +- Improve `VocabParallelEmbedding` custom op performance. It will be enabled in the next release. 
[#796](https://github.com/vllm-project/vllm-ascend/pull/796) +- Fixed a device discovery and setup bug when running vLLM Ascend on Ray [#884](https://github.com/vllm-project/vllm-ascend/pull/884) +- DeepSeek with [MC2](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/ascendcbestP/atlas_ascendc_best_practices_10_0043.html) (Merged Compute and Communication) now works properly. [#1268](https://github.com/vllm-project/vllm-ascend/pull/1268) +- Fixed log2phy NoneType bug with static EPLB feature. [#1186](https://github.com/vllm-project/vllm-ascend/pull/1186) +- Improved performance for DeepSeek with DBO enabled. [#997](https://github.com/vllm-project/vllm-ascend/pull/997), [#1135](https://github.com/vllm-project/vllm-ascend/pull/1135) +- Refactoring AscendFusedMoE [#1229](https://github.com/vllm-project/vllm-ascend/pull/1229) +- Add initial user stories page (include LLaMA-Factory/TRL/verl/MindIE Turbo/GPUStack) [#1224](https://github.com/vllm-project/vllm-ascend/pull/1224) +- Add unit test framework [#1201](https://github.com/vllm-project/vllm-ascend/pull/1201) + +### Known Issues +- In some cases, the vLLM process may crash with a **GatherV3** error when **aclgraph** is enabled. We are working on this issue and will fix it in the next release. [#1038](https://github.com/vllm-project/vllm-ascend/issues/1038) +- Prefix cache feature does not work with the Ascend Scheduler but without chunked prefill enabled. This will be fixed in the next release. 
[#1350](https://github.com/vllm-project/vllm-ascend/issues/1350) + +### Full Changelog +https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1 + +### New Contributors +- @farawayboat made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1333 +- @yzim made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1159 +- @chenwaner made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1098 +- @wangyanhui-cmss made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1184 +- @songshanhu07 made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1186 +- @yuancaoyaoHW made their first contribution in https://github.com/vllm-project/vllm-ascend/pull/1032 + +**Full Changelog**: https://github.com/vllm-project/vllm-ascend/compare/v0.9.0rc2...v0.9.1rc1 + +## v0.9.0rc2 - 2025.06.10 + +This release contains some quick fixes for v0.9.0rc1. Please use this release instead of v0.9.0rc1. + +### Highlights + +- Fix the import error when vllm-ascend is installed without editable way. [#1152](https://github.com/vllm-project/vllm-ascend/pull/1152) + +## v0.9.0rc1 - 2025.06.09 + +This is the 1st release candidate of v0.9.0 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this release, V1 Engine is recommended to use. The code of V0 Engine is frozen and will not be maintained any more. Please set environment `VLLM_USE_V1=1` to enable V1 Engine. + +### Highlights + +- DeepSeek works with graph mode now. Follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/user_guide/feature_guide/graph_mode.html) to take a try. [#789](https://github.com/vllm-project/vllm-ascend/pull/789) +- Qwen series models works with graph mode now. It works by default with V1 Engine. Please note that in this release, only Qwen series models are well tested with graph mode. 
We'll make it stable and generalize in the next release. If you hit any issues, please feel free to open an issue on GitHub and fallback to eager mode temporarily by set `enforce_eager=True` when initializing the model. + +### Core + +- The performance of multi-step scheduler has been improved. Thanks for the contribution from China Merchants Bank. [#814](https://github.com/vllm-project/vllm-ascend/pull/814) +- LoRA、Multi-LoRA And Dynamic Serving is supported for V1 Engine now. Thanks for the contribution from China Merchants Bank. [#893](https://github.com/vllm-project/vllm-ascend/pull/893) +- Prefix cache and chunked prefill feature works now [#782](https://github.com/vllm-project/vllm-ascend/pull/782) [#844](https://github.com/vllm-project/vllm-ascend/pull/844) +- Spec decode and MTP features work with V1 Engine now. [#874](https://github.com/vllm-project/vllm-ascend/pull/874) [#890](https://github.com/vllm-project/vllm-ascend/pull/890) +- DP feature works with DeepSeek now. [#1012](https://github.com/vllm-project/vllm-ascend/pull/1012) +- Input embedding feature works with V0 Engine now. [#916](https://github.com/vllm-project/vllm-ascend/pull/916) +- Sleep mode feature works with V1 Engine now. [#1084](https://github.com/vllm-project/vllm-ascend/pull/1084) + +### Model + +- Qwen2.5 VL works with V1 Engine now. [#736](https://github.com/vllm-project/vllm-ascend/pull/736) +- LLama4 works now. [#740](https://github.com/vllm-project/vllm-ascend/pull/740) +- A new kind of DeepSeek model called dual-batch overlap(DBO) is added. Please set `VLLM_ASCEND_ENABLE_DBO=1` to use it. [#941](https://github.com/vllm-project/vllm-ascend/pull/941) + +### Other + +- online serve with ascend quantization works now. [#877](https://github.com/vllm-project/vllm-ascend/pull/877) +- A batch of bugs for graph mode and moe model have been fixed. 
[#773](https://github.com/vllm-project/vllm-ascend/pull/773) [#771](https://github.com/vllm-project/vllm-ascend/pull/771) [#774](https://github.com/vllm-project/vllm-ascend/pull/774) [#816](https://github.com/vllm-project/vllm-ascend/pull/816) [#817](https://github.com/vllm-project/vllm-ascend/pull/817) [#819](https://github.com/vllm-project/vllm-ascend/pull/819) [#912](https://github.com/vllm-project/vllm-ascend/pull/912) [#897](https://github.com/vllm-project/vllm-ascend/pull/897) [#961](https://github.com/vllm-project/vllm-ascend/pull/961) [#958](https://github.com/vllm-project/vllm-ascend/pull/958) [#913](https://github.com/vllm-project/vllm-ascend/pull/913) [#905](https://github.com/vllm-project/vllm-ascend/pull/905)
+- A batch of performance improvement PRs have been merged. [#784](https://github.com/vllm-project/vllm-ascend/pull/784) [#803](https://github.com/vllm-project/vllm-ascend/pull/803) [#966](https://github.com/vllm-project/vllm-ascend/pull/966) [#839](https://github.com/vllm-project/vllm-ascend/pull/839) [#970](https://github.com/vllm-project/vllm-ascend/pull/970) [#947](https://github.com/vllm-project/vllm-ascend/pull/947) [#987](https://github.com/vllm-project/vllm-ascend/pull/987) [#1085](https://github.com/vllm-project/vllm-ascend/pull/1085)
+- From this release, binary wheel package will be released as well. [#775](https://github.com/vllm-project/vllm-ascend/pull/775)
+- The contributor doc site is [added](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)
+
+### Known Issue
+
+- In some cases, the vLLM process may crash with aclgraph enabled. We're working on this issue and it'll be fixed in the next release.
+- Multi node data-parallel doesn't work with this release. This is a known issue in vllm and has been fixed on main branch. [#18981](https://github.com/vllm-project/vllm/pull/18981)
+
+## v0.7.3.post1 - 2025.05.29
+
+This is the first post release of 0.7.3.
Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. It includes the following changes:
+
+### Highlights
+
+- Qwen3 and Qwen3MOE are supported now. The performance and accuracy of Qwen3 are well tested. You can try it now. Mindie Turbo is recommended to improve the performance of Qwen3. [#903](https://github.com/vllm-project/vllm-ascend/pull/903) [#915](https://github.com/vllm-project/vllm-ascend/pull/915)
+- Added a new performance guide. The guide aims to help users improve vllm-ascend performance on the system level. It includes OS configuration, library optimization, deploy guide and so on. [#878](https://github.com/vllm-project/vllm-ascend/pull/878) [Doc Link](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/developer_guide/performance/optimization_and_tuning.html)
+
+### Bug Fix
+
+- Qwen2.5-VL works for RLHF scenarios now. [#928](https://github.com/vllm-project/vllm-ascend/pull/928)
+- Users can launch the model from online weights now, e.g. from huggingface or modelscope directly [#858](https://github.com/vllm-project/vllm-ascend/pull/858) [#918](https://github.com/vllm-project/vllm-ascend/pull/918)
+- The meaningless log info `UserWorkspaceSize0` has been cleaned. [#911](https://github.com/vllm-project/vllm-ascend/pull/911)
+- The log level for `Failed to import vllm_ascend_C` has been changed to `warning` instead of `error`. [#956](https://github.com/vllm-project/vllm-ascend/pull/956)
+- DeepSeek MLA now works with chunked prefill in the V1 Engine. Please note that the V1 engine in 0.7.3 is just experimental and only for test usage. [#849](https://github.com/vllm-project/vllm-ascend/pull/849) [#936](https://github.com/vllm-project/vllm-ascend/pull/936)
+
+### Docs
+
+- The benchmark doc is updated for Qwen2.5 and Qwen2.5-VL [#792](https://github.com/vllm-project/vllm-ascend/pull/792)
+- Add the note to clarify that only "modelscope<1.23.0" works with 0.7.3.
[#954](https://github.com/vllm-project/vllm-ascend/pull/954)
+
+## v0.7.3 - 2025.05.08
+
+🎉 Hello, World!
+
+We are excited to announce the release of 0.7.3 for vllm-ascend. This is the first official release. The functionality, performance, and stability of this release are fully tested and verified. We encourage you to try it out and provide feedback. We'll post bug fix versions in the future if needed. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey.
+
+### Highlights
+- This release includes all features landed in the previous release candidates ([v0.7.1rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.1rc1), [v0.7.3rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc1), [v0.7.3rc2](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc2)). And all the features are fully tested and verified. Visit the official doc to get the detailed [feature](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/suppoted_features.html) and [model](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/user_guide/supported_models.html) support matrix.
+- Upgrade CANN to 8.1.RC1 to enable chunked prefill and automatic prefix caching features. You can enable them now.
+- Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#662](https://github.com/vllm-project/vllm-ascend/pull/662)
+- Integrate MindIE Turbo into vLLM Ascend to improve DeepSeek V3/R1, Qwen 2 series performance. [#708](https://github.com/vllm-project/vllm-ascend/pull/708)
+
+### Core
+- LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the official doc for more usage information. Thanks for the contribution from China Merchants Bank.
[#700](https://github.com/vllm-project/vllm-ascend/pull/700)
+
+### Model
+- The performance of Qwen2 vl and Qwen2.5 vl is improved. [#702](https://github.com/vllm-project/vllm-ascend/pull/702)
+- The performance of `apply_penalties` and `topKtopP` ops is improved. [#525](https://github.com/vllm-project/vllm-ascend/pull/525)
+
+### Other
+- Fixed an issue that may lead to a CPU memory leak. [#691](https://github.com/vllm-project/vllm-ascend/pull/691) [#712](https://github.com/vllm-project/vllm-ascend/pull/712)
+- A new environment `SOC_VERSION` is added. If you hit any soc detection error when building with custom ops enabled, please set `SOC_VERSION` to a suitable value. [#606](https://github.com/vllm-project/vllm-ascend/pull/606)
+- openEuler container image supported with v0.7.3-openeuler tag. [#665](https://github.com/vllm-project/vllm-ascend/pull/665)
+- Prefix cache feature works on V1 engine now. [#559](https://github.com/vllm-project/vllm-ascend/pull/559)
+
+## v0.8.5rc1 - 2025.05.06
+
+This is the 1st release candidate of v0.8.5 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Now you can enable the V1 engine by setting the environment variable `VLLM_USE_V1=1`, see the feature support status of vLLM Ascend in [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html).
+
+### Highlights
+- Upgrade CANN version to 8.1.RC1 to support chunked prefill and automatic prefix caching (`--enable_prefix_caching`) when V1 is enabled [#747](https://github.com/vllm-project/vllm-ascend/pull/747)
+- Optimize Qwen2 VL and Qwen 2.5 VL [#701](https://github.com/vllm-project/vllm-ascend/pull/701)
+- Improve Deepseek V3 eager mode and graph mode performance, now you can use --additional_config={'enable_graph_mode': True} to enable graph mode.
[#598](https://github.com/vllm-project/vllm-ascend/pull/598) [#719](https://github.com/vllm-project/vllm-ascend/pull/719) + +### Core +- Upgrade vLLM to 0.8.5.post1 [#715](https://github.com/vllm-project/vllm-ascend/pull/715) +- Fix early return in CustomDeepseekV2MoE.forward during profile_run [#682](https://github.com/vllm-project/vllm-ascend/pull/682) +- Adapts for new quant model generated by modelslim [#719](https://github.com/vllm-project/vllm-ascend/pull/719) +- Initial support on P2P Disaggregated Prefill based on llm_datadist [#694](https://github.com/vllm-project/vllm-ascend/pull/694) +- Use `/vllm-workspace` as code path and include `.git` in container image to fix issue when start vllm under `/workspace` [#726](https://github.com/vllm-project/vllm-ascend/pull/726) +- Optimize NPU memory usage to make DeepSeek R1 W8A8 32K model len work. [#728](https://github.com/vllm-project/vllm-ascend/pull/728) +- Fix `PYTHON_INCLUDE_PATH` typo in setup.py [#762](https://github.com/vllm-project/vllm-ascend/pull/762) + +### Other +- Add Qwen3-0.6B test [#717](https://github.com/vllm-project/vllm-ascend/pull/717) +- Add nightly CI [#668](https://github.com/vllm-project/vllm-ascend/pull/668) +- Add accuracy test report [#542](https://github.com/vllm-project/vllm-ascend/pull/542) + +## v0.8.4rc2 - 2025.04.29 + +This is the second release candidate of v0.8.4 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. Some experimental features are included in this version, such as W8A8 quantization and EP/DP support. We'll make them stable enough in the next release. + +### Highlights +- Qwen3 and Qwen3MOE is supported now. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/single_npu.html) to run the quick demo. [#709](https://github.com/vllm-project/vllm-ascend/pull/709) +- Ascend W8A8 quantization method is supported now. 
Please take the [official doc](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_quantization.html) for example. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/619) is welcome. [#580](https://github.com/vllm-project/vllm-ascend/pull/580) +- DeepSeek V3/R1 works with DP, TP and MTP now. Please note that it's still in experimental status. Let us know if you hit any problem. [#429](https://github.com/vllm-project/vllm-ascend/pull/429) [#585](https://github.com/vllm-project/vllm-ascend/pull/585) [#626](https://github.com/vllm-project/vllm-ascend/pull/626) [#636](https://github.com/vllm-project/vllm-ascend/pull/636) [#671](https://github.com/vllm-project/vllm-ascend/pull/671) + +### Core +- ACLGraph feature is supported with V1 engine now. It's disabled by default because this feature rely on CANN 8.1 release. We'll make it available by default in the next release [#426](https://github.com/vllm-project/vllm-ascend/pull/426) +- Upgrade PyTorch to 2.5.1. vLLM Ascend no longer relies on the dev version of torch-npu now. Now users don't need to install the torch-npu by hand. The 2.5.1 version of torch-npu will be installed automatically. [#661](https://github.com/vllm-project/vllm-ascend/pull/661) + +### Other +- MiniCPM model works now. [#645](https://github.com/vllm-project/vllm-ascend/pull/645) +- openEuler container image supported with `v0.8.4-openeuler` tag and customs Ops build is enabled by default for openEuler OS. [#689](https://github.com/vllm-project/vllm-ascend/pull/689) +- Fix ModuleNotFoundError bug to make Lora work [#600](https://github.com/vllm-project/vllm-ascend/pull/600) +- Add "Using EvalScope evaluation" doc [#611](https://github.com/vllm-project/vllm-ascend/pull/611) +- Add a `VLLM_VERSION` environment to make vLLM version configurable to help developer set correct vLLM version if the code of vLLM is changed by hand locally. 
[#651](https://github.com/vllm-project/vllm-ascend/pull/651) + +## v0.8.4rc1 - 2025.04.18 + +This is the first release candidate of v0.8.4 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to start the journey. From this version, vllm-ascend will follow the newest version of vllm and release every two weeks. For example, if vllm releases v0.8.5 in the next two weeks, vllm-ascend will release v0.8.5rc1 instead of v0.8.4rc2. Please find the detail from the [official documentation](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#release-window). + +### Highlights + +- vLLM V1 engine experimental support is included in this version. You can visit [official guide](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html) to get more detail. By default, vLLM will fallback to V0 if V1 doesn't work, please set `VLLM_USE_V1=1` environment if you want to use V1 forcely. +- LoRA、Multi-LoRA And Dynamic Serving is supported now. The performance will be improved in the next release. Please follow the [official doc](https://docs.vllm.ai/en/latest/features/lora.html) for more usage information. Thanks for the contribution from China Merchants Bank. [#521](https://github.com/vllm-project/vllm-ascend/pull/521). +- Sleep Mode feature is supported. Currently it's only work on V0 engine. V1 engine support will come soon. [#513](https://github.com/vllm-project/vllm-ascend/pull/513) + +### Core + +- The Ascend scheduler is added for V1 engine. This scheduler is more affinity with Ascend hardware. More scheduler policy will be added in the future. [#543](https://github.com/vllm-project/vllm-ascend/pull/543) +- Disaggregated Prefill feature is supported. Currently only 1P1D works. NPND is under design by vllm team. vllm-ascend will support it once it's ready from vLLM. Follow the [official guide](https://docs.vllm.ai/en/latest/features/disagg_prefill.html) to use. 
[#432](https://github.com/vllm-project/vllm-ascend/pull/432)
+- Spec decode feature works now. Currently it only works on the V0 engine. V1 engine support will come soon. [#500](https://github.com/vllm-project/vllm-ascend/pull/500)
+- Structured output feature works now on V1 Engine. Currently it only supports the xgrammar backend, while using the guidance backend may get some errors. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)
+
+### Other
+
+- A new communicator `pyhccl` is added. It's used to call the CANN HCCL library directly instead of using `torch.distribute`. More usage of it will be added in the next release [#503](https://github.com/vllm-project/vllm-ascend/pull/503)
+- The custom ops build is enabled by default. You should install the packages like `gcc`, `cmake` first to build `vllm-ascend` from source. Set `COMPILE_CUSTOM_KERNELS=0` environment to disable the compilation if you don't need it. [#466](https://github.com/vllm-project/vllm-ascend/pull/466)
+- The custom op `rotary embedding` is enabled by default now to improve the performance. [#555](https://github.com/vllm-project/vllm-ascend/pull/555)
+
+## v0.7.3rc2 - 2025.03.29
+
+This is the 2nd release candidate of v0.7.3 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey.
+- Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html
+- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html
+
+### Highlights
+- Add Ascend Custom Ops framework. Developers now can write custom ops using AscendC. An example op `rotary_embedding` is added. More tutorials will come soon. The Custom Ops compilation is disabled by default when installing vllm-ascend. Set `COMPILE_CUSTOM_KERNELS=1` to enable it. [#371](https://github.com/vllm-project/vllm-ascend/pull/371)
+- V1 engine is basically supported in this release. The full support will be done in the 0.8.X release.
If you hit any issue or have any requirement of V1 engine. Please tell us [here](https://github.com/vllm-project/vllm-ascend/issues/414). [#376](https://github.com/vllm-project/vllm-ascend/pull/376) +- Prefix cache feature works now. You can set `enable_prefix_caching=True` to enable it. [#282](https://github.com/vllm-project/vllm-ascend/pull/282) + +### Core +- Bump torch_npu version to dev20250320.3 to improve accuracy to fix `!!!` output problem. [#406](https://github.com/vllm-project/vllm-ascend/pull/406) + +### Model +- The performance of Qwen2-vl is improved by optimizing patch embedding (Conv3D). [#398](https://github.com/vllm-project/vllm-ascend/pull/398) + +### Other + +- Fixed a bug to make sure multi step scheduler feature work. [#349](https://github.com/vllm-project/vllm-ascend/pull/349) +- Fixed a bug to make prefix cache feature works with correct accuracy. [#424](https://github.com/vllm-project/vllm-ascend/pull/424) + +## v0.7.3rc1 - 2025.03.14 + +🎉 Hello, World! This is the first release candidate of v0.7.3 for vllm-ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev) to start the journey. +- Quickstart with container: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/quick_start.html +- Installation: https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/installation.html + +### Highlights +- DeepSeek V3/R1 works well now. Read the [official guide](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/multi_node.html) to start! [#242](https://github.com/vllm-project/vllm-ascend/pull/242) +- Speculative decoding feature is supported. [#252](https://github.com/vllm-project/vllm-ascend/pull/252) +- Multi step scheduler feature is supported. [#300](https://github.com/vllm-project/vllm-ascend/pull/300) + +### Core +- Bump torch_npu version to dev20250308.3 to improve `_exponential` accuracy +- Added initial support for pooling models. 
Bert-based models, such as `BAAI/bge-base-en-v1.5` and `BAAI/bge-reranker-v2-m3`, work now. [#229](https://github.com/vllm-project/vllm-ascend/pull/229)
+
+### Model
+- The performance of Qwen2-VL is improved. [#241](https://github.com/vllm-project/vllm-ascend/pull/241)
+- MiniCPM is now supported [#164](https://github.com/vllm-project/vllm-ascend/pull/164)
+
+### Other
+- Support MTP(Multi-Token Prediction) for DeepSeek V3/R1 [#236](https://github.com/vllm-project/vllm-ascend/pull/236)
+- [Docs] Added more model tutorials, including DeepSeek, QwQ, Qwen and Qwen 2.5VL. See the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.3-dev/tutorials/index.html) for detail
+- Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve: https://github.com/vllm-project/vllm/pull/13807
+
+### Known issues
+- In [some cases](https://github.com/vllm-project/vllm-ascend/issues/324), especially when the input/output is very long, the accuracy of output may be incorrect. We are working on it. It'll be fixed in the next release.
+- Improved and reduced the garbled code in model output. But if you still hit the issue, try to change the generation config value, such as `temperature`, and try again. There is also a known issue shown below. Any [feedback](https://github.com/vllm-project/vllm-ascend/issues/267) is welcome. [#277](https://github.com/vllm-project/vllm-ascend/pull/277)
+
+## v0.7.1rc1 - 2025.02.19
+
+🎉 Hello, World!
+
+We are excited to announce the first release candidate of v0.7.1 for vllm-ascend.
+
+vLLM Ascend Plugin (vllm-ascend) is a community maintained hardware plugin for running vLLM on the Ascend NPU. With this release, users can now enjoy the latest features and improvements of vLLM on the Ascend NPU.
+
+Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/v0.7.1-dev) to start the journey. Note that this is a release candidate, and there may be some bugs or issues.
We appreciate your feedback and suggestions [here](https://github.com/vllm-project/vllm-ascend/issues/19) + +### Highlights + +- Initial supports for Ascend NPU on vLLM. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) +- DeepSeek is now supported. [#88](https://github.com/vllm-project/vllm-ascend/pull/88) [#68](https://github.com/vllm-project/vllm-ascend/pull/68) +- Qwen, Llama series and other popular models are also supported, you can see more details in [here](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html). + +### Core + +- Added the Ascend quantization config option, the implementation will coming soon. [#7](https://github.com/vllm-project/vllm-ascend/pull/7) [#73](https://github.com/vllm-project/vllm-ascend/pull/73) +- Add silu_and_mul and rope ops and add mix ops into attention layer. [#18](https://github.com/vllm-project/vllm-ascend/pull/18) + +### Other + +- [CI] Enable Ascend CI to actively monitor and improve quality for vLLM on Ascend. [#3](https://github.com/vllm-project/vllm-ascend/pull/3) +- [Docker] Add vllm-ascend container image [#64](https://github.com/vllm-project/vllm-ascend/pull/64) +- [Docs] Add a [live doc](https://vllm-ascend.readthedocs.org) [#55](https://github.com/vllm-project/vllm-ascend/pull/55) + +### Known issues + +- This release relies on an unreleased torch_npu version. It has been installed within official container image already. Please [install](https://vllm-ascend.readthedocs.io/en/v0.7.1rc1/installation.html) it manually if you are using non-container environment. +- There are logs like `No platform detected, vLLM is running on UnspecifiedPlatform` or `Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")` shown when running vllm-ascend. It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/12432) which will be included in v0.7.3 soon. 
+- There are logs like `# CPU blocks: 35064, # CPU blocks: 2730` shown when running vllm-ascend which should be `# NPU blocks:` . It actually doesn't affect any functionality and performance. You can just ignore it. And it has been fixed in this [PR](https://github.com/vllm-project/vllm/pull/13378) which will be included in v0.7.3 soon. diff --git a/docs/source/user_guide/support_matrix/index.md b/docs/source/user_guide/support_matrix/index.md new file mode 100644 index 0000000..3a8bf68 --- /dev/null +++ b/docs/source/user_guide/support_matrix/index.md @@ -0,0 +1,10 @@ +# Features and models + +This section provides a detailed supported matrix by vLLM Ascend. + +:::{toctree} +:caption: Support Matrix +:maxdepth: 1 +supported_models +supported_features +::: diff --git a/docs/source/user_guide/support_matrix/supported_features.md b/docs/source/user_guide/support_matrix/supported_features.md new file mode 100644 index 0000000..d3585d2 --- /dev/null +++ b/docs/source/user_guide/support_matrix/supported_features.md @@ -0,0 +1,45 @@ +# Feature Support + +The feature support principle of vLLM Ascend is: **aligned with the vLLM**. We are also actively collaborating with the community to accelerate support. + +You can check the [support status of vLLM V1 Engine][v1_user_guide]. Below is the feature support status of vLLM Ascend: + +| Feature | Status | Next Step | +|-------------------------------|----------------|------------------------------------------------------------------------| +| Chunked Prefill | 🟢 Functional | Functional, see detail note: [Chunked Prefill][cp] | +| Automatic Prefix Caching | 🟢 Functional | Functional, see detail note: [vllm-ascend#732][apc] | +| LoRA | 🟢 Functional | [vllm-ascend#396][multilora], [vllm-ascend#893][v1 multilora] | +| Speculative decoding | 🟢 Functional | Basic support | +| Pooling | 🟢 Functional | CI needed and adapting more models; V1 support rely on vLLM support. 
| +| Enc-dec | 🟡 Planned | vLLM should support this feature first. | +| Multi Modality | 🟢 Functional | [Tutorial][multimodal], optimizing and adapting more models | +| LogProbs | 🟢 Functional | CI needed | +| Prompt logProbs | 🟢 Functional | CI needed | +| Async output | 🟢 Functional | CI needed | +| Beam search | 🟢 Functional | CI needed | +| Guided Decoding | 🟢 Functional | [vllm-ascend#177][guided_decoding] | +| Tensor Parallel | 🟢 Functional | Make TP >4 work with graph mode | +| Pipeline Parallel | 🟢 Functional | Write official guide and tutorial. | +| Expert Parallel | 🟢 Functional | Dynamic EPLB support. | +| Data Parallel | 🟢 Functional | Data Parallel support for Qwen3 MoE. | +| Prefill Decode Disaggregation | 🟢 Functional | Functional, xPyD is supported. | +| Quantization | 🟢 Functional | W8A8 available; working on more quantization method support(W4A8, etc) | +| Graph Mode | 🔵 Experimental| Experimental, see detail note: [vllm-ascend#767][graph_mode] | +| Sleep Mode | 🟢 Functional | | + +- 🟢 Functional: Fully operational, with ongoing optimizations. +- 🔵 Experimental: Experimental support, interfaces and functions may change. +- 🚧 WIP: Under active development, will be supported soon. +- 🟡 Planned: Scheduled for future implementation (some may have open PRs/RFCs). +- 🔴 NO plan / Deprecated: No plan or deprecated by vLLM. 
+ +[v1_user_guide]: https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html +[multimodal]: https://vllm-ascend.readthedocs.io/en/latest/tutorials/single_npu_multimodal.html +[guided_decoding]: https://github.com/vllm-project/vllm-ascend/issues/177 +[multilora]: https://github.com/vllm-project/vllm-ascend/issues/396 +[v1 multilora]: https://github.com/vllm-project/vllm-ascend/pull/893 +[graph_mode]: https://github.com/vllm-project/vllm-ascend/issues/767 +[apc]: https://github.com/vllm-project/vllm-ascend/issues/732 +[cp]: https://docs.vllm.ai/en/stable/performance/optimization.html#chunked-prefill +[1P1D]: https://github.com/vllm-project/vllm-ascend/pull/950 +[ray]: https://github.com/vllm-project/vllm-ascend/issues/1751 diff --git a/docs/source/user_guide/support_matrix/supported_models.md b/docs/source/user_guide/support_matrix/supported_models.md new file mode 100644 index 0000000..7cd8ff3 --- /dev/null +++ b/docs/source/user_guide/support_matrix/supported_models.md @@ -0,0 +1,79 @@ +# Model Support + +Get the newest info here: https://github.com/vllm-project/vllm-ascend/issues/1608 + +## Text-only Language Models + +### Generative Models + +| Model | Supported | Note | +|-------------------------------|-----------|----------------------------------------------------------------------| +| DeepSeek v3 | ✅ | | +| DeepSeek R1 | ✅ | | +| DeepSeek Distill (Qwen/LLama) | ✅ | | +| Qwen3 | ✅ | | +| Qwen3-based | ✅ | | +| Qwen3-Coder | ✅ | | +| Qwen3-Moe | ✅ | | +| Qwen2.5 | ✅ | | +| Qwen2 | ✅ | | +| Qwen2-based | ✅ | | +| QwQ-32B | ✅ | | +| LLama2/3/3.1 | ✅ | | +| Internlm | ✅ | [#1962](https://github.com/vllm-project/vllm-ascend/issues/1962) | +| Baichuan | ✅ | | +| Baichuan2 | ✅ | | +| Phi-4-mini | ✅ | | +| MiniCPM | ✅ | | +| MiniCPM3 | ✅ | | +| Ernie4.5 | ✅ | | +| Ernie4.5-Moe | ✅ | | +| Gemma-2 | ✅ | | +| Gemma-3 | ✅ | | +| Phi-3/4 | ✅ | | +| Mistral/Mistral-Instruct | ✅ | | +| GLM-4.5 | ✅ | | +| GLM-4 | ❌ | 
[#2255](https://github.com/vllm-project/vllm-ascend/issues/2255) | +| GLM-4-0414 | ❌ | [#2258](https://github.com/vllm-project/vllm-ascend/issues/2258) | +| ChatGLM | ❌ | [#554](https://github.com/vllm-project/vllm-ascend/issues/554) | +| DeepSeek v2.5 | 🟡 | Need test | +| Mllama | 🟡 | Need test | +| MiniMax-Text | 🟡 | Need test | + +### Pooling Models + +| Model | Supported | Note | +|-------------------------------|-----------|----------------------------------------------------------------------| +| Qwen3-Embedding | ✅ | | +| Molmo | ✅ | [1942](https://github.com/vllm-project/vllm-ascend/issues/1942) | +| XLM-RoBERTa-based | ❌ | [1960](https://github.com/vllm-project/vllm-ascend/issues/1960) | + +## Multimodal Language Models + +### Generative Models + +| Model | Supported | Note | +|--------------------------------|---------------|----------------------------------------------------------------------| +| Qwen2-VL | ✅ | | +| Qwen2.5-VL | ✅ | | +| Qwen2.5-Omni | ✅ | [1760](https://github.com/vllm-project/vllm-ascend/issues/1760) | +| QVQ | ✅ | | +| LLaVA 1.5/1.6 | ✅ | [1962](https://github.com/vllm-project/vllm-ascend/issues/1962) | +| InternVL2 | ✅ | | +| InternVL2.5 | ✅ | | +| Qwen2-Audio | ✅ | | +| Aria | ✅ | | +| LLaVA-Next | ✅ | | +| LLaVA-Next-Video | ✅ | | +| MiniCPM-V | ✅ | | +| Mistral3 | ✅ | | +| Phi-3-Vison/Phi-3.5-Vison | ✅ | | +| Gemma3 | ✅ | | +| LLama4 | ❌ | [1972](https://github.com/vllm-project/vllm-ascend/issues/1972) | +| LLama3.2 | ❌ | [1972](https://github.com/vllm-project/vllm-ascend/issues/1972) | +| Keye-VL-8B-Preview | ❌ | [1963](https://github.com/vllm-project/vllm-ascend/issues/1963) | +| Florence-2 | ❌ | [2259](https://github.com/vllm-project/vllm-ascend/issues/2259) | +| GLM-4V | ❌ | [2260](https://github.com/vllm-project/vllm-ascend/issues/2260) | +| InternVL2.0/2.5/3.0
InternVideo2.5/Mono-InternVL | ❌ | [2064](https://github.com/vllm-project/vllm-ascend/issues/2064) | +| Whisper | ❌ | [2262](https://github.com/vllm-project/vllm-ascend/issues/2262) | +| Ultravox | 🟡 Need test | | diff --git a/examples/disaggregated_prefill_v1/README.md b/examples/disaggregated_prefill_v1/README.md new file mode 100644 index 0000000..eec8924 --- /dev/null +++ b/examples/disaggregated_prefill_v1/README.md @@ -0,0 +1,246 @@ +# Disaggregated Prefill-Decode Deployment Guide + +## Overview +This demo document provides instructions for running a disaggregated vLLM-ascend service with separate prefill and decode stages across 4 nodes, using 16 Ascend NPUs for two prefill nodes (P1/P2) and 16 Ascend NPUs for two decode nodes (D1/D2). + +## Prerequisites +- Ascend NPU environment with vLLM 0.9.1 installed +- Network interfaces configured for distributed communication (e.g. eth0) +- Model weights located at `/models/deepseek_r1_w8a8` + +## Rank table generation +The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. The following command generates a rank table for all nodes with 16 cards for prefill and 16 cards for decode: + +Run the following command on every node to generate the rank table: +```shell +cd /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ +bash gen_ranktable.sh --ips 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 \ + --npus-per-node 8 --network-card-name eth0 --prefill-device-cnt 16 --decode-device-cnt 16 +``` +The rank table will be generated at `/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json` + +## Start disaggregated vLLM-ascend service +For demonstration purposes, we will utilize the quantized version of Deepseek-R1.
Recommended Parallelization Strategies: +- P-node: DP2-TP8-EP16 (Data Parallelism 2, Tensor Parallelism 8, Expert Parallelism 16) +- D-node: DP4-TP4-EP16 (Data Parallelism 4, Tensor Parallelism 4, Expert Parallelism 16) + +Execution Sequence +- 4 configured node ip are: 172.19.32.175 172.19.241.49 172.19.123.51 172.19.190.36 +- Start Prefill on Node 1 (P1) +- Start Prefill on Node 2 (P2) +- Start Decode on Node 1 (D1) +- Start Decode on Node 2 (D2) +- Start proxy server on Node1 + +Run prefill server P1 on first node: +```shell +export HCCL_IF_IP=172.19.32.175 # node ip +export GLOO_SOCKET_IFNAME="eth0" # network card name +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +export VLLM_LLMDD_RPC_PORT=5559 + +vllm serve /models/deepseek_r1_w8a8 \ + --host 0.0.0.0 \ + --port 20002 \ + --data-parallel-size 2 \ + --data-parallel-size-local 1 \ + --api-server-count 2 \ + --data-parallel-address 172.19.32.175 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"chunked_prefill_for_mla":true}' +``` + +Run prefill server P2 on second node: +```shell +export HCCL_IF_IP=172.19.241.49 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export 
DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +export VLLM_LLMDD_RPC_PORT=5659 + +vllm serve /models/deepseek_r1_w8a8 \ + --host 0.0.0.0 \ + --port 20002 \ + --headless \ + --data-parallel-size 2 \ + --data-parallel-start-rank 1 \ + --data-parallel-size-local 1 \ + --data-parallel-address 172.19.32.175 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 8 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 32768 \ + --max-num-batched-tokens 32768 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --enforce-eager \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"chunked_prefill_for_mla":true}' +``` + +Run decode server d1 on third node: + +* In the D node, the `max-num-batched-tokens` parameter can be set to a smaller value since the D node processes at most `max-num-seqs` batches concurrently. As the `profile_run` only needs to handle `max-num-seqs` sequences at a time, we can safely set `max-num-batched-tokens` equal to `max-num-seqs`. This optimization will help reduce activation memory consumption. 
+```shell +export HCCL_IF_IP=172.19.123.51 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +export VLLM_LLMDD_RPC_PORT=5759 + +vllm serve /models/deepseek_r1_w8a8 \ + --host 0.0.0.0 \ + --port 20002 \ + --data-parallel-size 4 \ + --data-parallel-size-local 2 \ + --api-server-count 2 \ + --data-parallel-address 172.19.123.51 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name deepseek \ + --max-model-len 32768 \ + --max-num-batched-tokens 256 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":true}}' +``` + +Run decode server d2 on last node: +```shell +export HCCL_IF_IP=172.19.190.36 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1/ranktable.json +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 +export VLLM_USE_V1=1 +export VLLM_LLMDD_RPC_PORT=5859 + +vllm serve /models/deepseek_r1_w8a8 \ + --host 0.0.0.0 \ + --port 20002 \ + --headless \ + --data-parallel-size 4 \ + --data-parallel-start-rank 2 \ + --data-parallel-size-local 2 \ + --data-parallel-address 172.19.123.51 \ + --data-parallel-rpc-port 13356 \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name 
deepseek \ + --max-model-len 32768 \ + --max-num-batched-tokens 256 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"torchair_graph_config": {"enabled":true}}' +``` + +Run proxy server on the first node: +```shell +cd /vllm-workspace/vllm-ascend/examples/disaggregated_prefill_v1 +python toy_proxy_server.py --host 172.19.32.175 --port 1025 --prefiller-hosts 172.19.241.49 --prefiller-port 20002 --decoder-hosts 172.19.123.51 --decoder-ports 20002 +``` + +Verification +Check service health using the proxy server endpoint: +```shell +curl http://localhost:1025/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "deepseek", + "prompt": "Who are you?", + "max_tokens": 100, + "temperature": 0 + }' +``` + +Performance +Test performance with vllm benchmark: +```shell +cd /vllm-workspace/vllm/benchmarks +python3 benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 1536 \ + --num-prompts 256 \ + --ignore-eos \ + --model deepseek \ + --tokenizer /models/deepseek_r1_w8a8 \ + --host localhost \ + --port 1025 \ + --endpoint /v1/completions \ + --max-concurrency 4 \ + --request-rate 4 +``` diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.py b/examples/disaggregated_prefill_v1/gen_ranktable.py new file mode 100644 index 0000000..52db3ee --- /dev/null +++ b/examples/disaggregated_prefill_v1/gen_ranktable.py @@ -0,0 +1,122 @@ +import argparse +import json +import os + +import torch.distributed as dist + +from vllm_ascend.utils import AscendSocVersion, init_ascend_soc_version, get_ascend_soc_version + +parser = argparse.ArgumentParser( + 
description="Arguments of rank table generator", ) +parser.add_argument("--local-host", type=str, required=True, help="local ip") +parser.add_argument("--prefill-device-cnt", + type=int, + required=True, + help="number of prefill devices") +parser.add_argument("--decode-device-cnt", + type=int, + required=True, + help="number of decode devices") +args = parser.parse_args() +local_host = args.local_host +prefill_device_cnt = args.prefill_device_cnt +decode_device_cnt = args.decode_device_cnt + +print("enter py") + +hccn_tool_path = os.environ.get("HCCN_TOOL_PATH", + "/usr/local/Ascend/driver/tools/hccn_tool") +master_addr = os.environ.get("MASTER_ADDR") +master_port = os.environ.get("MASTER_PORT") +rank = os.environ.get("RANK") +local_rank = os.environ.get("LOCAL_RANK") +# This variable is set by torchrun, +# and is different from WORLD_SIZE in gen_rank_table.sh. +world_size = os.environ.get("WORLD_SIZE") + +init_ascend_soc_version() +soc_info = get_ascend_soc_version() + + +def get_cmd_stdout(cmd): + import subprocess + return subprocess.run(cmd, capture_output=True, + shell=True).stdout.decode("utf-8").strip() + + +print(f"local_host: {local_host}") +print("gen ranktable.json") + +num_cards = get_cmd_stdout("npu-smi info -l | grep \"Total Count\"").split( + ":")[1].strip() +num_cards = int(num_cards) +chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split( + "\n")[0].split(":")[1].strip() +chips_per_card = int(chips_per_card) + +# generate local device list for local rank 0, and gather it to all ranks +local_device_list: list[dict[str, str]] = list() +if local_rank == "0": + super_pod_id = "0" + for card_id in range(num_cards): + for chip_id in range(chips_per_card): + device_id = card_id * chips_per_card + chip_id + if soc_info == AscendSocVersion.A3: + device_ip = get_cmd_stdout( + f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr" + ).split(":")[1].strip() + super_device_id = get_cmd_stdout( + f"npu-smi info -t spod-info -i 
{card_id} -c {chip_id} | grep SDID" + ).split(":")[1].strip() + super_pod_id = get_cmd_stdout( + f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\"" + ).split(":")[1].strip() + else: + device_ip = get_cmd_stdout( + f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr" + ).split(":")[1].strip() + + device_info = { + "server_id": local_host, + "device_id": str(device_id), + "device_ip": str(device_ip), + } + if soc_info == AscendSocVersion.A3: + device_info.update({ + "super_pod_id": str(super_pod_id), + "super_device_id": str(super_device_id) + }) + local_device_list.append(device_info) + +dist.init_process_group(backend=dist.Backend.GLOO) +global_device_list = [None] * dist.get_world_size() +dist.all_gather_object(global_device_list, local_device_list) +global_device_list = [ + device_info for device_list in global_device_list + for device_info in device_list # type: ignore[attr-defined] +] +cnt = 1 +for device_info in global_device_list: # type: ignore[assignment] + device_info["cluster_id"] = str(cnt) + cnt += 1 +assert (prefill_device_cnt + decode_device_cnt) <= len(global_device_list), \ +"prefill_device_cnt + decode_device_cnt must be less than or equal to number of all devices in cluster" +ranktable = { + "version": + "1.2", + "server_count": + str(world_size), + "prefill_device_list": + global_device_list[:prefill_device_cnt], + "decode_device_list": + global_device_list[prefill_device_cnt:prefill_device_cnt + + decode_device_cnt], + "status": + "completed" +} + +if local_rank == '0': + with open("ranktable.json", "w") as f: + json.dump(ranktable, f, indent=4) + + print("gen ranktable.json done") diff --git a/examples/disaggregated_prefill_v1/gen_ranktable.sh b/examples/disaggregated_prefill_v1/gen_ranktable.sh new file mode 100644 index 0000000..e8a923a --- /dev/null +++ b/examples/disaggregated_prefill_v1/gen_ranktable.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +source /usr/local/Ascend/ascend-toolkit/set_env.sh +export 
LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} + +NPUS_PER_NODE=8 +while [[ $# -gt 0 ]]; do + case "$1" in + --ips) + shift + while [[ $# -gt 0 && ! "$1" == --* ]]; do + IPs+=("$1") + shift + done + ;; + --npus-per-node) + shift + NPUS_PER_NODE="$1" + shift + ;; + --network-card-name) + shift + NETWORK_CARD_NAME="$1" + shift + ;; + --prefill-device-cnt) + shift + PREFILL_DEVICE_CNT="$1" + shift + ;; + --decode-device-cnt) + shift + DECODE_DEVICE_CNT="$1" + shift + ;; + esac +done +LOCAL_HOSTS=($(hostname -I)) +LOCAL_HOST="127.0.0.1" +MASTER_ADDR=${IPs[0]} +MASTER_PORT=6657 +NNODES=${#IPs[@]} +NODE_RANK="8" +for i in "${!IPs[@]}"; do + ip="${IPs[$i]}" + for local_host in "${LOCAL_HOSTS[@]}"; do + if [[ "$local_host" == "$ip" ]]; then + LOCAL_HOST=$local_host + NODE_RANK=$i + break 2 + fi + done +done + +if [[ $NODE_RANK == "" ]];then + echo "[Error] para \"NODE_RANK\" must be defined" + exit 1 +fi + +WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) +RANKSTART=`expr $NPUS_PER_NODE \* $NODE_RANK` + +echo "========>param:" +echo "LOCAL_HOST": $LOCAL_HOST +echo "WORLD_SIZE: " $WORLD_SIZE +echo "RANKSTART": $RANKSTART +echo "NNODES": $NNODES +echo "NODE_RANK": $NODE_RANK +echo "===============" + +if [[ -n "${GEN_RANKTABLE}" || ! 
-e ${PWD}/ranktable.json ]]; then + GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \ + --nproc_per_node 1 \ + --nnodes ${NNODES} \ + --node_rank ${NODE_RANK} \ + --master_addr ${MASTER_ADDR} \ + --master_port ${MASTER_PORT} \ + gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT +fi diff --git a/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py new file mode 100644 index 0000000..727233e --- /dev/null +++ b/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py @@ -0,0 +1,546 @@ +# Adapted from https://github.com/vllm-project/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py + +# SPDX-License-Identifier: Apache-2.0 +# +# Tutorial: Using the Load Balance Proxy Server Example +# +# This proxy server is designed to distribute requests between multiple +# "prefiller" and "decoder" backend servers for large language model inference. +# It is useful for scaling out inference workloads and balancing load across +# multiple backend instances. +# +# Features: +# - Load balances requests to multiple prefiller and decoder servers. +# - Supports OpenAI-compatible /v1/completions and /v1/chat/completions endpoints. +# - Streams responses from backend servers to clients. +# +# Prerequisites: +# - Python 3.8+ +# - Install dependencies: +# pip install fastapi httpx uvicorn vllm +# +# Step 1: Start Your Backend Servers +# ---------------------------------- +# You need to have at least one prefiller and one decoder backend running. +# These can be mock servers or actual vLLM servers. +# +# For testing, you can use the provided mock server: +# +# vllm serve --host 0.0.0.0 --port 8100 ... # Prefiller 1 +# vllm serve --host 0.0.0.0 --port 8101 ... # Prefiller 2 +# vllm serve --host 0.0.0.0 --port 8200 ... # Decoder 1 +# vllm serve --host 0.0.0.0 --port 8201 ... 
# Decoder 2 +# +# Step 2: Start the Proxy Server +# ------------------------------ +# Run the proxy server, specifying the host/port for each prefiller and decoder: +# +# python load_balance_proxy_server_example.py \ +# --host 0.0.0.0 --port 9000 \ +# --prefiller-hosts 127.0.0.1 127.0.0.1 \ +# --prefiller-ports 8100 8101 \ +# --decoder-hosts 127.0.0.1 127.0.0.1 \ +# --decoder-ports 8200 8201 +# +# This will start the proxy on port 9000, load balancing between two prefiller +# and two decoder servers. +# +# Step 3: Send a Request to the Proxy +# ----------------------------------- +# You can now send OpenAI-compatible requests to the proxy. For example: +# +# curl -X POST http://localhost:9000/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "your-model", +# "prompt": "The quick brown fox jumps over the lazy dog", +# "max_tokens": 16 +# }' +# +# Or for chat completions: +# +# curl -X POST http://localhost:9000/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "your-model", +# "messages": [{"role": "user", "content": "Hello!"}], +# "max_tokens": 16 +# }' +# +# Step 4: Health Check +# -------------------- +# To check if the proxy is running and see how many backend instances are +# connected, use: +# +# curl http://localhost:9000/healthcheck +# +# This will return a JSON object with the status and the number of prefiller +# and decoder instances. +# +# Notes: +# - You can scale the number of prefiller and decoder servers as needed. +# - The proxy will round-robin requests to balance load. +# - For production, ensure your backend servers are robust and secure. +# +# For more details, see the code and comments in this file. 
+ + +import argparse +import asyncio +import functools +import heapq +import os +import sys +import uuid +from contextlib import asynccontextmanager +from typing import List + +import httpx +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# Add uvloop for faster event loop if available +try: + import uvloop + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) +except ImportError: + pass + + +class ServerState: + + def __init__(self, host, port): + self.host = host + self.port = port + self.url = f'http://{host}:{port}/v1' + self.client = httpx.AsyncClient(timeout=None, + base_url=self.url, + limits=httpx.Limits( + max_connections=100000, + max_keepalive_connections=100000)) + self.active_tokens = 0 + self.active_kv_cache = 0 # Only for prefiller + self.active_requests = 0 # Number of active requests + self.aborted_requests = set() # Track aborted requests + # Removed individual server lock - will use global locks instead + + +class ProxyState: + + def __init__(self, prefiller_instances, decoder_instances): + self.prefillers: List[ServerState] = [ + ServerState(h, p) for h, p in prefiller_instances + ] + self.decoders: List[ServerState] = [ + ServerState(h, p) for h, p in decoder_instances + ] + self.req_to_prefiller = {} + self.req_id_lock = asyncio.Lock() + # Removed selection locks - no longer needed for synchronous methods + + # Initialize priority queues for efficient server selection + # Each entry is (priority_score, server_index, server_reference) + # Lower priority score = higher priority (less loaded) + self.prefiller_heap = [(0, i, server) + for i, server in enumerate(self.prefillers)] + self.decoder_heap = [(0, i, server) + for i, server in enumerate(self.decoders)] + heapq.heapify(self.prefiller_heap) + heapq.heapify(self.decoder_heap) + + def _update_prefiller_priority(self, server_idx: int): + """Update the priority of a prefiller 
server in the heap.""" + server = self.prefillers[server_idx] + # Priority based on active_tokens and active_kv_cache + priority = server.active_tokens + server.active_kv_cache * 0.3 + # Remove old entry and add new one + self.prefiller_heap = [(p, i, s) for p, i, s in self.prefiller_heap + if i != server_idx] + heapq.heappush(self.prefiller_heap, + (priority, server_idx, server)) # type: ignore + + def _update_decoder_priority(self, server_idx: int): + """Update the priority of a decoder server in the heap.""" + server = self.decoders[server_idx] + priority = server.active_tokens + # Remove old entry and add new one + self.decoder_heap = [(p, i, s) for p, i, s in self.decoder_heap + if i != server_idx] + heapq.heappush(self.decoder_heap, + (priority, server_idx, server)) # type: ignore + + def abort_prefiller_request(self, server_idx: int, + request_id): # Changed to synchronous + """ + Mark a request as aborted. This will helps to release kv cache in + prefiller node. + """ + # No lock needed - atomic operation + self.prefillers[server_idx].aborted_requests.add(request_id) + + def aquire_aborted_prefiller_requests( + self, server_idx: int): # Changed to synchronous + """ + Get the set of aborted requests and clear it. + This is used to release kv cache in prefiller node. 
+ """ + # No lock needed - atomic operation + aborted_requests = self.prefillers[server_idx].aborted_requests.copy() + self.prefillers[server_idx].aborted_requests.clear() + return aborted_requests + + async def next_req_id(self): + async with self.req_id_lock: + return str(uuid.uuid4()) + + def select_prefiller(self, token_count): # Changed to synchronous + # No lock needed - entire function is atomic + if not self.prefiller_heap: + raise RuntimeError("No prefiller servers available") + + priority, chosen, server = heapq.heappop(self.prefiller_heap) + + # Update the chosen server atomically + self.prefillers[chosen].active_tokens += token_count + self.prefillers[chosen].active_kv_cache += token_count + + # Update priority and re-add to heap + self._update_prefiller_priority(chosen) + + return chosen + + def release_prefiller(self, idx, token_count): # Changed to synchronous + # No lock needed - atomic operation + self.prefillers[idx].active_tokens -= token_count + # Update priority queue after releasing + self._update_prefiller_priority(idx) + + def release_prefiller_kv(self, idx, token_count): # Changed to synchronous + # No lock needed - atomic operation + if self.prefillers[idx].active_kv_cache > 0: + self.prefillers[idx].active_kv_cache -= token_count + # Update priority queue after releasing + self._update_prefiller_priority(idx) + + def select_decoder(self, token_count): # Changed to synchronous + # No lock needed - entire function is atomic + if not self.decoder_heap: + raise RuntimeError("No decoder servers available") + + priority, chosen, server = heapq.heappop(self.decoder_heap) + + # Update the chosen server atomically + self.decoders[chosen].active_tokens += token_count + + # Update priority and re-add to heap + self._update_decoder_priority(chosen) + + return chosen + + def release_decoder(self, idx, token_count): # Changed to synchronous + # No lock needed - atomic operation + self.decoders[idx].active_tokens -= token_count + # Update priority queue 
after releasing + self._update_decoder_priority(idx) + + # Omni_infer's calculate_input_scores function + def calculate_prefill_scores(self, request_length: int) -> float: + length_score = request_length / 4.0 + input_score = length_score * 0.0345 + 120.0745 + return input_score + + def calculate_decode_scores(self, request_length: int) -> float: + return request_length + + +proxy_state = None + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--prefiller-hosts", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--prefiller-ports", + type=int, + nargs="+", + default=[8001]) + parser.add_argument("--decoder-hosts", + type=str, + nargs="+", + default=["localhost"]) + parser.add_argument("--decoder-ports", type=int, nargs="+", default=[8002]) + parser.add_argument("--max-retries", + type=int, + default=3, + help="Maximum number of retries for HTTP requests") + parser.add_argument( + "--retry-delay", + type=float, + default=0.001, + help="Base delay (seconds) for exponential backoff retries") + args = parser.parse_args() + if len(args.prefiller_hosts) != len(args.prefiller_ports): + raise ValueError( + "Number of prefiller hosts must match number of prefiller ports") + if len(args.decoder_hosts) != len(args.decoder_ports): + raise ValueError( + "Number of decoder hosts must match number of decoder ports") + args.prefiller_instances = list( + zip(args.prefiller_hosts, args.prefiller_ports)) + args.decoder_instances = list(zip(args.decoder_hosts, args.decoder_ports)) + return args + + +@asynccontextmanager +async def lifespan(app: FastAPI): + global proxy_state + proxy_state = ProxyState(global_args.prefiller_instances, + global_args.decoder_instances) + print( + f"Initialized {len(proxy_state.prefillers)} prefill clients and {len(proxy_state.decoders)} decode clients." 
+ ) + yield + for p in proxy_state.prefillers: + await p.client.aclose() + for d in proxy_state.decoders: + await d.client.aclose() + + +async def listen_for_disconnect(request: Request) -> None: + """Return if a disconnect message is received""" + while True: + message = await request.receive() + if message["type"] == "http.disconnect": + break + + +def with_cancellation(handler_func): + + @functools.wraps(handler_func) + async def wrapper(*args, **kwargs): + request = kwargs["request"] + handler_task = asyncio.create_task(handler_func(*args, **kwargs)) + cancellation_task = asyncio.create_task(listen_for_disconnect(request)) + done, pending = await asyncio.wait([handler_task, cancellation_task], + return_when=asyncio.FIRST_COMPLETED) + for task in pending: + task.cancel() + if handler_task in done: + return handler_task.result() + return None + + return wrapper + + +app = FastAPI(lifespan=lifespan) + + +async def send_request_to_service(client: httpx.AsyncClient, + prefiller_id: int, + endpoint: str, + req_data: dict, + request_id: str, + max_retries: int = 3, + base_delay: float = 0.2): + aborted_requests = proxy_state.aquire_aborted_prefiller_requests( + prefiller_id) + req_data = req_data.copy() + req_data['kv_transfer_params'] = { + "do_remote_decode": True, + "do_remote_prefill": False, + "remote_engine_id": None, + "remote_block_ids": None, + "remote_host": None, + "remote_port": None, + "aborted_request": list(aborted_requests), + } + req_data["stream"] = False + req_data["max_tokens"] = 1 + if "stream_options" in req_data: + del req_data["stream_options"] + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + last_exc = None + for attempt in range(1, max_retries + 1): + try: + response = await client.post(endpoint, + json=req_data, + headers=headers) + response.raise_for_status() + return response + except (httpx.RequestError, httpx.HTTPStatusError) as e: + logger.warning( + f"Attempt {attempt} 
failed for {endpoint}: {str(e)}") + last_exc = e + if attempt < max_retries: + await asyncio.sleep(base_delay * (2**(attempt - 1))) + else: + logger.error( + f"All {max_retries} attempts failed for {endpoint}.") + raise last_exc + + +async def stream_service_response_with_retry(client: httpx.AsyncClient, + endpoint: str, + req_data: dict, + request_id: str, + max_retries: int = 3, + base_delay: float = 0.2): + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + for attempt in range(1, max_retries + 1): + try: + async with client.stream("POST", + endpoint, + json=req_data, + headers=headers) as response: + response.raise_for_status() + first_chunk_sent = False + async for chunk in response.aiter_bytes(): + first_chunk_sent = True + yield chunk + return # Success, exit after streaming + except (httpx.RequestError, httpx.HTTPStatusError) as e: + if attempt < max_retries: + logger.warning( + f"Attempt {attempt} failed for streaming {endpoint}: {str(e)}" + ) + await asyncio.sleep(base_delay * (2**(attempt - 1))) + else: + logger.error( + f"All {max_retries} attempts failed for streaming {endpoint}." + ) + raise e + except Exception as e: + # If any chunk has been sent, do not retry, just log and drop + if 'first_chunk_sent' in locals() and first_chunk_sent: + logger.error( + f"Streaming to client interrupted after response started: {str(e)}" + ) + return + else: + if attempt < max_retries: + logger.warning( + f"Attempt {attempt} failed for streaming {endpoint}: {str(e)}" + ) + await asyncio.sleep(base_delay * (2**(attempt - 1))) + else: + logger.error( + f"All {max_retries} attempts failed for streaming {endpoint}." 
+ ) + raise e + + +async def _handle_completions(api: str, request: Request): + try: + req_data = await request.json() + req_body = await request.body() + request_length = len(req_body) + prefiller_score = proxy_state.calculate_prefill_scores(request_length) + logger.debug( + f"Request length: {request_length}, Prefiller score: {prefiller_score}" + ) + request_id = await proxy_state.next_req_id() + # Select prefiller + prefiller_idx = proxy_state.select_prefiller(prefiller_score) + prefiller = proxy_state.prefillers[prefiller_idx] + # Send request to prefiller + response = await send_request_to_service( + prefiller.client, + prefiller_idx, + api, + req_data, + request_id, + max_retries=global_args.max_retries, + base_delay=global_args.retry_delay) + proxy_state.release_prefiller(prefiller_idx, prefiller_score) + response_json = response.json() + kv_transfer_params = response_json.get('kv_transfer_params', {}) + if kv_transfer_params: + req_data["kv_transfer_params"] = kv_transfer_params + # Select decoder + decoder_score = proxy_state.calculate_decode_scores(request_length) + logger.debug("Decoder score: %f", decoder_score) + # Use the prefiller's kv_transfer_params to select decoder + decoder_idx = proxy_state.select_decoder(decoder_score) + decoder = proxy_state.decoders[decoder_idx] + logger.debug("Using %s %s", prefiller.url, decoder.url) + # Stream response from decoder + released_kv = False + + async def generate_stream(): + nonlocal released_kv + # Only one await per chunk, minimal logic in loop + try: + async for chunk in stream_service_response_with_retry( + decoder.client, + api, + req_data, + request_id=request_id, + max_retries=global_args.max_retries, + base_delay=global_args.retry_delay): + if not released_kv and chunk: + proxy_state.release_prefiller_kv( + prefiller_idx, prefiller_score) + released_kv = True + yield chunk + except Exception as e: + logger.error( + f"Error during streaming from decoder {decoder.url}: {str(e)} the aborted request 
{request_id} will be routing to the target prefiller when new request is ready to dispatch to it" + ) + proxy_state.abort_prefiller_request(prefiller_idx, request_id) + proxy_state.release_prefiller_kv(prefiller_idx, + prefiller_score) + + # After streaming done, release tokens + proxy_state.release_decoder(decoder_idx, decoder_score) + + return StreamingResponse(generate_stream(), + media_type="application/json") + except Exception as e: + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server" + f" - {api} endpoint") + print(e) + print("".join(traceback.format_exception(*exc_info))) + raise + + +@app.post("/v1/completions") +@with_cancellation +async def handle_completions(request: Request): + return await _handle_completions("/completions", request) + + +@app.post("/v1/chat/completions") +@with_cancellation +async def handle_chat_completions(request: Request): + return await _handle_completions("/chat/completions", request) + + +@app.get("/healthcheck") +async def healthcheck(): + return { + "status": "ok", + "prefill_instances": len(proxy_state.prefillers), + "decode_instances": len(proxy_state.decoders) + } + + +if __name__ == '__main__': + global global_args + global_args = parse_args() + import uvicorn + uvicorn.run(app, host=global_args.host, port=global_args.port) diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md new file mode 100644 index 0000000..3e916cc --- /dev/null +++ b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md @@ -0,0 +1,165 @@ +# Mooncake connector deployment Guide + +## Environmental Dependencies + + * Software: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (same version as vllm-ascend) + * mooncake-transfer-engine reference documentation: 
https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md + +The vllm version must be the same as the main branch of vllm-ascend, for example, 2025/07/30. The version is + + * vllm: v0.10.1 + * vllm-ascend: v0.10.1rc1 + +## run + +### 1.Run `prefill` Node + +``` +bash run_prefill.sh +``` + +Content of the run_prefill.sh script + +``` +export HCCL_EXEC_TIMEOUT=204 +export HCCL_CONNECT_TIMEOUT=120 +export HCCL_IF_IP=localhost +export GLOO_SOCKET_IFNAME="xxxxxx" +export TP_SOCKET_IFNAME="xxxxxx" +export HCCL_SOCKET_IFNAME="xxxxxx" +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 +export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -) + +vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \ + --host localhost \ + --port 8100 \ + --tensor-parallel-size 2\ + --seed 1024 \ + --max-model-len 2000 \ + --max-num-batched-tokens 2000 \ + --trust-remote-code \ + --enforce-eager \ + --data-parallel-size 2 \ + --data-parallel-address localhost \ + --data-parallel-rpc-port 9100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector": "MooncakeConnectorV1", + "kv_buffer_device": "npu", + "kv_role": "kv_producer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": "0", + "kv_rank": 0, + "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", + "kv_connector_extra_config": { + "prefill": { + "dp_size": 2, + "tp_size": 2 + }, + "decode": { + "dp_size": 2, + "tp_size": 2 + } + } + }' +``` + +`HCCL_EXEC_TIMEOUT`, `HCCL_CONNECT_TIMEOUT`, and `HCCL_IF_IP` are hccl-related configurations.
+Set `GLOO_SOCKET_IFNAME`, `TP_SOCKET_IFNAME`, and `HCCL_SOCKET_IFNAME` to the corresponding NIC.
+`ASCEND_RT_VISIBLE_DEVICES` specifies the NPU cards used by this node. The total number of cards must equal `dp_size*tp_size`.
+`/xxxxx/DeepSeek-V2-Lite-Chat` is the path of the model to be served; replace it with your local model path.
+`--host`: the IP address on which this node's service listens.
+`--port`: the service port; it must match the corresponding port passed to the proxy server in step 3.
+`--seed`, `--max-model-len`, and `--max-num-batched-tokens` are basic model configuration options. Set them according to your deployment requirements.
+`--tensor-parallel-size`: specifies the TP size.
+`--data-parallel-size`: indicates the DP size.
+`--data-parallel-address`: the IP address of the data-parallel master node; set it to this node's IP address. `--data-parallel-rpc-port`: the RPC port used for communication within the DP group.
+`--trust-remote-code` allows loading a model with custom code from the local path.
+`--enforce-eager` disables graph mode and forces eager execution.
+`--gpu-memory-utilization`: the fraction of device memory each card is allowed to use.
+`--kv-transfer-config`: `kv_connector` and `kv_connector_module_path` select the Mooncake connector, and `kv_buffer_device` is set to `npu`. Set `kv_role` to `kv_producer` on prefill (p) nodes and to `kv_consumer` on decode (d) nodes, `kv_parallel_size` to 1, and `kv_port` to the port used by the node. Set `engine_id` and `kv_rank` to 0 on the p node and to 1 on the d node. In `kv_connector_extra_config`, configure the parallel layout of the p and d nodes consistently with `--tensor-parallel-size` and `--data-parallel-size`.
+ + +### 2. Run `decode` Node + +``` +bash run_decode.sh +``` + +Content of the run_decode.sh script + +``` +export HCCL_EXEC_TIMEOUT=204 +export HCCL_CONNECT_TIMEOUT=120 +export HCCL_IF_IP=localhost +export GLOO_SOCKET_IFNAME="xxxxxx" +export TP_SOCKET_IFNAME="xxxxxx" +export HCCL_SOCKET_IFNAME="xxxxxx" +export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 +export PHYSICAL_DEVICES=$(ls /dev/davinci* 2>/dev/null | grep -o '[0-9]\+' | sort -n | paste -sd',' -) + +vllm serve "/xxxxx/DeepSeek-V2-Lite-Chat" \ + --host localhost \ + --port 8200 \ + --tensor-parallel-size 2\ + --seed 1024 \ + --max-model-len 2000 \ + --max-num-batched-tokens 2000 \ + --trust-remote-code \ + --enforce-eager \ + --data-parallel-size 2 \ + --data-parallel-address localhost \ + --data-parallel-rpc-port 9100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector": "MooncakeConnectorV1", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20002", + "engine_id": "1", + "kv_rank": 1, + "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", + "kv_connector_extra_config": { + "prefill": { + "dp_size": 2, + "tp_size": 2 + }, + "decode": { + "dp_size": 2, + "tp_size": 2 + } + } + }' +``` + +### 3. Start proxy_server. ### + +``` +cd /vllm-ascend/examples/disaggregate_prefill_v1/ +python load_balance_proxy_server_example.py --host localhost --prefiller-hosts host1 host2 --prefiller-ports 8100 8101 --decoder-hosts host3 host4 --decoder-ports 8200 8201 +``` + +`--host`: indicates the active node. The value of localhost in the curl command delivered in step 5 must be the same as the host. The default port number for starting the service proxy is 8000.
+`--prefiller-hosts`: Set this parameter to the IP addresses of all p nodes. In the xpyd scenario, add the IP addresses to the end of this configuration item and leave a blank space between the IP addresses.
+`--prefiller-ports`: Set this parameter to the port number of all p nodes, which is the configuration of the port number for the vllm to start the service in step 3. Write the port number after the configuration in sequence and leave a blank space between the port number and the port number. The sequence must be one-to-one mapping to the IP address of --prefiller-hosts.
+`--decoder-hosts`: Set this parameter to the IP addresses of all d nodes. In the xpyd scenario, add the IP addresses to the end of this configuration item and leave a blank space between the IP addresses.
+`--decoder-ports`: Set this parameter to the port number of all d nodes, which is the configuration of the port number for the vllm to start the service in step 4. Set port to the end of the configuration, and leave a blank space between port and port. The sequence must be one-to-one mapping to the IP address of --decoder-hosts.
+ + +### 4. Run Inference + +Set the IP address in the inference file to the actual IP address. Set the model variable to the path of the model. Ensure that the path is the same as that in the shell script. + +``` +curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ +"model": "model_path", +"prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? 
Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", +"max_tokens": 256 +}' +``` \ No newline at end of file diff --git a/examples/disaggregated_prefill_v1/run_server.sh b/examples/disaggregated_prefill_v1/run_server.sh new file mode 100644 index 0000000..37cf6d3 --- /dev/null +++ b/examples/disaggregated_prefill_v1/run_server.sh @@ -0,0 +1,32 @@ +export HCCL_IF_IP=141.61.39.117 +export GLOO_SOCKET_IFNAME="enp48s3u1u1" +export TP_SOCKET_IFNAME="enp48s3u1u1" +export HCCL_SOCKET_IFNAME="enp48s3u1u1" +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=path-to-rank-table + +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 + +export VLLM_USE_V1=1 + +vllm serve model_path \ + --host 0.0.0.0 \ + --port 20002 \ + --tensor-parallel-size 1\ + --seed 1024 \ + --served-model-name dsv3 \ + --max-model-len 2000 \ + ---max-num-batched-tokens 2000 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": 1, + "kv_port": "20001", + "engine_id": 0, + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_connector_v1_a3" + }' \ + --additional-config \ + '{"enable_graph_mode": "True"}'\ diff --git a/examples/eplb/eplb_deepseek.py b/examples/eplb/eplb_deepseek.py new file mode 100644 index 0000000..fb67f9c --- /dev/null +++ b/examples/eplb/eplb_deepseek.py @@ -0,0 +1,205 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Expert parallelism load balancer (EPLB) for vLLM. +The rearrangement algorithm is adapted from +[DeepSeek EPLB](https://github.com/deepseek-ai/eplb). 
+""" +from typing import Tuple + +import torch + + +def balanced_packing(weight: torch.Tensor, + num_packs: int) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs + are as balanced as possible. + + Parameters: + weight: [X, n], the weight of each item + num_packs: number of packs + + Returns: + pack_index: [X, n], the pack index of each item + rank_in_pack: [X, n], the rank of the item in the pack + """ + num_layers, num_groups = weight.shape + assert num_groups % num_packs == 0 + groups_per_pack = num_groups // num_packs + + if groups_per_pack == 1: + pack_index = torch.arange(weight.size(-1), + dtype=torch.int64, + device=weight.device).expand(weight.shape) + rank_in_pack = torch.zeros_like(weight, dtype=torch.int64) + return pack_index, rank_in_pack + + indices = weight.float().sort(-1, descending=True).indices.cpu() + pack_index = torch.full_like(weight, + fill_value=-1, + dtype=torch.int64, + device='cpu') + rank_in_pack = torch.full_like(pack_index, fill_value=-1) + for i in range(num_layers): + pack_weights = [0] * num_packs + pack_items = [0] * num_packs + for group in indices[i]: + pack = min( + (i + for i in range(num_packs) if pack_items[i] < groups_per_pack), + key=pack_weights.__getitem__) + assert pack_items[pack] < groups_per_pack + pack_index[i, group] = pack + rank_in_pack[i, group] = pack_items[pack] + pack_weights[pack] += weight[i, group] + pack_items[pack] += 1 + return pack_index, rank_in_pack + + +def replicate_experts( + weight: torch.Tensor, + num_phy: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized. 
+ + Parameters: + weight: [X, num_log] + num_phy: total number of experts after replication + + Returns: + phy2log: [X, num_phy], logical expert id of each physical expert + rank: [X, num_phy], the replica rank + logcnt: [X, num_log], number of replicas for each logical expert + """ + n, num_log = weight.shape + num_redundant = num_phy - num_log + assert num_redundant >= 0 + device = weight.device + phy2log = torch.arange(num_phy, dtype=torch.int64, + device=device).repeat(n, 1) + rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device) + logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device) + arangen = torch.arange(n, dtype=torch.int64, device=device) + for i in range(num_log, num_phy): + redundant_indices = (weight / logcnt).max(dim=-1).indices + phy2log[:, i] = redundant_indices + rank[:, i] = logcnt[arangen, redundant_indices] + logcnt[arangen, redundant_indices] += 1 + return phy2log, rank, logcnt + + +def rebalance_experts_hierarchical(weight: torch.Tensor, + num_physical_experts: int, num_groups: int, + num_nodes: int, num_gpus: int): + """ + Parameters: + weight: [num_moe_layers, num_logical_experts] + num_physical_experts: number of physical experts after replication + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [num_moe_layers, num_physical_experts] + logical_to_physical_map: [num_moe_layers, num_logical_experts, X] + logical_count: [num_moe_layers, num_logical_experts] + """ + num_layers, num_logical_experts = weight.shape + assert num_logical_experts % num_groups == 0 + group_size = num_logical_experts // num_groups + assert num_groups % num_nodes == 0 + groups_per_node = num_groups // num_nodes + assert num_gpus % num_nodes == 0 + assert num_physical_experts % num_gpus == 0 + phy_experts_per_gpu = num_physical_experts // num_gpus + + def inverse(perm: 
torch.Tensor) -> torch.Tensor: + inv = torch.empty_like(perm) + inv.scatter_( + 1, perm, + torch.arange(perm.size(1), dtype=torch.int64, + device=perm.device).expand(perm.shape)) + return inv + + # Step 1: pack groups to nodes + tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1) + group_pack_index, group_rank_in_pack = balanced_packing( + tokens_per_group, num_nodes) + log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * + group_size).unsqueeze(-1) + + torch.arange(group_size, + dtype=torch.int64, + device=group_pack_index.device)).flatten(-2) + mlog2log = inverse(log2mlog) + + # Step 2: construct redundant experts within nodes + # [num_layers * num_nodes, num_logical_experts // num_nodes] + tokens_per_mlog = weight.gather(-1, mlog2log).view( + -1, num_logical_experts // num_nodes) + phy2mlog, phyrank, mlogcnt = replicate_experts( + tokens_per_mlog, num_physical_experts // num_nodes) + + # Step 3: pack physical_experts to GPUs + # [num_layers * num_nodes, num_physical_experts // num_nodes] + tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog) + pack_index, rank_in_pack = balanced_packing(tokens_per_phy, + num_gpus // num_nodes) + phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack + pphy2phy = inverse(phy2pphy) + + pphy2mlog = phy2mlog.gather( + -1, pphy2phy) # [num_layers * num_nodes, num_log_per_nodes] + pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + device=group_pack_index.device).view(1, -1, 1)).flatten(-2) + pphy2log = mlog2log.gather(-1, pphy2mlog) + pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1) + logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog) + return pphy2log, pphyrank, logcnt + + +def rebalance_experts( + weight: torch.Tensor, num_replicas: int, num_groups: int, + num_nodes: int, + num_gpus: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Entry point for 
expert-parallelism load balancer. + + Parameters: + weight: [layers, num_logical_experts], the load statistics for all logical experts + num_replicas: number of physical experts, must be a multiple of `num_gpus` + num_groups: number of expert groups + num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster + num_gpus: number of GPUs, must be a multiple of `num_nodes` + + Returns: + physical_to_logical_map: [layers, num_replicas], the expert index of each replica + logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert + expert_count: [layers, num_logical_experts], number of physical replicas for each logical expert + """ + num_layers, num_logical_experts = weight.shape + weight = weight.float().cpu() + if num_groups % num_nodes == 0: + # use hierarchical load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, num_groups, num_nodes, num_gpus) + else: + # use global load-balance policy + phy2log, phyrank, logcnt = rebalance_experts_hierarchical( + weight, num_replicas, 1, 1, num_gpus) + maxlogcnt = logcnt.max().item() + log2phy: torch.Tensor = torch.full( + (num_layers, num_logical_experts, maxlogcnt), + -1, + dtype=torch.int64, + device=logcnt.device) + log2phy.view(num_layers, -1).scatter_( + -1, phy2log * maxlogcnt + phyrank, + torch.arange(num_replicas, dtype=torch.int64, + device=log2phy.device).expand(num_layers, -1)) + return phy2log, log2phy, logcnt + + +__all__ = ['rebalance_experts'] diff --git a/examples/eplb/eplb_strategy.py b/examples/eplb/eplb_strategy.py new file mode 100644 index 0000000..bcccbf2 --- /dev/null +++ b/examples/eplb/eplb_strategy.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. 
+import json +import logging +import os + +import matplotlib.pyplot as plt # type: ignore +import numpy as np +import torch + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +logger = logging.getLogger("msit_logger") + + +def save_matrix_to_json(output_path, file_name, deployment): + num_layers = deployment.shape[0] + num_cards = deployment.shape[1] + + data = {"moe_layer_count": num_layers} + layer_list = [] + for i in range(num_layers): + layer = {"layer_id": i, "device_count": num_cards} + device_list = [] + for j in range(num_cards): + device = { + "device_id": j, + "device_expert": deployment[i, j].tolist() + } + device_list.append(device) + layer["device_list"] = device_list + layer_list.append(layer) + data["layer_list"] = layer_list + + file_name = f"{output_path}{file_name}.json" + + # Save as JSON file + try: + with open(file_name, 'w') as f: + json.dump(data, f, indent=4) + except Exception as e: + print(f"write {file_name} failed: {e}") + + +def calculate_average(lst): + """calculate the average of a list""" + if not lst: + raise ValueError("list is empty") + + total = 0.0 + count = 0 + + for element in lst: + # Check if element is numeric + if isinstance(element, (int, float, np.int64, np.float64)): + total += float(element) + count += 1 + else: + # Non-numeric elements will be ignored with a warning + print(f"warning: element {element} is not a number, ignored") + + if count == 0: + raise ValueError("list does not contain any number") + + return total / count + + +def layer_imblance_polt(y_list, label_names, device_num, output_path, + file_name): + + plt.rcParams['font.sans-serif'] = ['Arial'] + plt.rcParams['axes.unicode_minus'] = False + x = [i for i in range(58)] + for index, y in enumerate(y_list): + plt.plot(x, + y, + label=rf'{label_names[index]},avg={calculate_average(y)}') + + plt.legend() + plt.title(rf'Load Distribution (num_gpus={device_num})') + plt.xlabel('layer') + plt.ylabel('Device 
Load') + + # Show grid lines + plt.grid(True) + + plt.savefig(os.path.join(output_path, file_name), dpi=300) + + # Clear current plot + plt.close() + + +def deepseek_deploy(workload, num_redundancy_expert, num_groups, num_nodes, + num_gpus, num_original_expert): + from eplb_deepseek import rebalance_experts + num_replicas = num_original_expert + num_redundancy_expert + hy2log, log2phy, logcnt = rebalance_experts(workload, num_replicas, + num_groups, num_nodes, + num_gpus) + + # Convert to global_deployment + workload = workload.cpu().numpy() + global_deployment = [] + layer_num = log2phy.shape[0] + num_physical_experts_local = (num_original_expert + + num_redundancy_expert) // num_gpus + for layer_idx in range(layer_num): + layer_deployment = [] + for gpu_idx in range(num_gpus): + local_deployment = hy2log[layer_idx][gpu_idx * + num_physical_experts_local: + (gpu_idx + 1) * + num_physical_experts_local] + local_deployment = local_deployment.flatten() + layer_deployment.append(local_deployment.tolist()) + global_deployment.append(layer_deployment) + + # Remap expert distribution according to log2phy + original_weights = [] + max_weights = [] + average_weights = [] + y_list = [] + for layer_idx in range(layer_num): + new_value = workload[layer_idx].reshape(num_gpus, -1) + row_sum = np.sum(new_value, axis=1) + original_weights.append(row_sum.max()) + average_weights.append((np.sum(workload[layer_idx]) / num_gpus)) + + opt_workload = np.zeros((num_original_expert + num_redundancy_expert), + dtype=np.float64) + for expert_idx in range(num_original_expert): + physical_expert_idxs = log2phy[layer_idx][expert_idx] + physical_expert_idxs = physical_expert_idxs.flatten() + physical_expert_idxs = physical_expert_idxs[ + physical_expert_idxs != -1] + for physical_expert_idx in physical_expert_idxs: + opt_workload[physical_expert_idx] += workload[layer_idx][ + expert_idx] / len(physical_expert_idxs) + opt_workload = opt_workload.reshape(num_gpus, -1) + row_sum = 
np.sum(opt_workload, axis=1) + max_weights.append(row_sum.max()) + + y_list = [original_weights, max_weights, average_weights] + return global_deployment, y_list + + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--exp_name", type=str, default="gsm8k_temp0.0") + parser.add_argument("--num_original_expert", type=int, default=256) + parser.add_argument("--input_path", type=str, default="") + parser.add_argument("--output_path", type=str, default="") + parser.add_argument("--num_redundancy_expert", type=int, default=0) + parser.add_argument("--num_devices", type=int, default=32) + parser.add_argument("--num_groups", type=int, default=8) + parser.add_argument("--num_nodes", type=int, default=4) + args = parser.parse_args() + exp_name = args.exp_name + input_path = args.input_path + output_path = args.output_path + os.makedirs(output_path, exist_ok=True) + num_redundancy_expert = args.num_redundancy_expert + num_devices = args.num_devices + num_original_expert = args.num_original_expert + num_groups = args.num_groups + num_nodes = args.num_nodes + + # NOTE: assume input workload format: [layer_num, num_experts] + workload = torch.load(input_path, map_location=torch.device('cpu')) + global_deployment, y_list = deepseek_deploy(workload, + num_redundancy_expert, + num_groups, num_nodes, + num_devices, + num_original_expert) + + file_name = f"{exp_name}_{num_devices}_{num_redundancy_expert}" + save_matrix_to_json(output_path, file_name, np.array(global_deployment)) + label_names = [ + 'default deployment max load', 'balanced load max load', + 'balanced load avg load' + ] + new_file_name = f"{exp_name}_{num_devices}_{num_redundancy_expert}.png" + layer_imblance_polt(y_list, label_names, num_devices, output_path, + new_file_name) diff --git a/examples/external_online_dp/README.md b/examples/external_online_dp/README.md new file mode 100644 index 0000000..4681de5 --- /dev/null +++ 
b/examples/external_online_dp/README.md @@ -0,0 +1,38 @@ +Here is an example guiding how to use `launch_online_dp.py` to launch external dp server in vllm. User can easily launch external dp server following the steps below: + +### Modify parameters in `run_dp_template.sh` +`run_dp_template.sh` is an template script used to launch each dp vllm instance separately. It will be called by `launch_online_dp.py` in multi threads and most of its configurations are set by `launch_online_dp.py`. Parameters you need to set manually include: + +1. The IP and socket_ifname of your machine. If running on multi-nodes, please make sure the scripts on each node has been set with correct IP and socket_ifname of that node. +2. vLLM serving related parameters including model_path and other configurations. Note that port, dp-related parammeters and tp_size is set by `launch_online_dp.py`, all the other vLLM parameters in this file only serve as an example and you are free to modify them according to your purpose. + +### Run `launch_online_dp.py` with CL arguments +All the arguments that can be set by users are: + +1. `--dp-size`: global data parallel size, must be set +2. `--tp-size`: tensor parallel size, default 1 +3. `--dp-size-local`: local data parallel size, defaultly set to `dp_size` +4. `--dp-rank-start`: Starting rank for data parallel, default 0 +5. `--dp-address`: IP address of data parallel master node +6. `--dp-rpc-port`: Port of data parallel master node, default 12345 +7. 
`--vllm-start-port`: Starting port of vLLM serving instances, default 9000 + +An example of running external DP in one single node: +```(python) +cd examples/external_online_dp +# running DP4 TP4 in a node with 16 NPUs +python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 4 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342 +``` + +An example of running external DP in two nodes: +```(python) +cd examples/external_online_dp +# running DP4 TP4 in two nodes with 8 NPUs each + +# On node 0: +python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 0 --dp-address x.x.x.x --dp-rpc-port 12342 + +# On node 1: +python launch_online_dp.py --dp-size 4 --tp-size 4 --dp-size-local 2 --dp-rank-start 2 --dp-address x.x.x.x --dp-rpc-port 12342 +``` + diff --git a/examples/external_online_dp/launch_online_dp.py b/examples/external_online_dp/launch_online_dp.py new file mode 100644 index 0000000..0045ecd --- /dev/null +++ b/examples/external_online_dp/launch_online_dp.py @@ -0,0 +1,97 @@ +import argparse +import multiprocessing +import os +import subprocess +import sys + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dp-size", + type=int, + required=True, + help="Data parallel size." + ) + parser.add_argument( + "--tp-size", + type=int, + default=1, + help="Tensor parallel size." + ) + parser.add_argument( + "--dp-size-local", + type=int, + default=-1, + help="Local data parallel size." + ) + parser.add_argument( + "--dp-rank-start", + type=int, + default=0, + help="Starting rank for data parallel." + ) + parser.add_argument( + "--dp-address", + type=str, + required=True, + help="IP address for data parallel master node." + ) + parser.add_argument( + "--dp-rpc-port", + type=str, + default=12345, + help="Port for data parallel master node." + ) + parser.add_argument( + "--vllm-start-port", + type=int, + default=9000, + help="Starting port for the engine." 
+ ) + return parser.parse_args() + +args = parse_args() +dp_size = args.dp_size +tp_size = args.tp_size +dp_size_local = args.dp_size_local +if dp_size_local == -1: + dp_size_local = dp_size +dp_rank_start = args.dp_rank_start +dp_address = args.dp_address +dp_rpc_port = args.dp_rpc_port +vllm_start_port = args.vllm_start_port + +def run_command(visiable_devices, dp_rank, vllm_engine_port): + command = [ + "bash", + "./run_dp_template.sh", + visiable_devices, + str(vllm_engine_port), + str(dp_size), + str(dp_rank), + dp_address, + dp_rpc_port, + str(tp_size), + ] + subprocess.run(command, check=True) + +if __name__ == "__main__": + template_path = "./run_dp_template.sh" + if not os.path.exists(template_path): + print(f"Template file {template_path} does not exist.") + sys.exit(1) + + processes = [] + num_cards = dp_size_local * tp_size + for i in range(dp_size_local): + dp_rank = dp_rank_start + i + vllm_engine_port = vllm_start_port + i + visiable_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size)) + process = multiprocessing.Process(target=run_command, + args=(visiable_devices, dp_rank, + vllm_engine_port)) + processes.append(process) + process.start() + + for process in processes: + process.join() \ No newline at end of file diff --git a/examples/external_online_dp/run_dp_template.sh b/examples/external_online_dp/run_dp_template.sh new file mode 100644 index 0000000..661bdfa --- /dev/null +++ b/examples/external_online_dp/run_dp_template.sh @@ -0,0 +1,46 @@ +export HCCL_IF_IP=your_ip_here +export GLOO_SOCKET_IFNAME=your_socket_ifname_here +export TP_SOCKET_IFNAME=your_socket_ifname_here +export HCCL_SOCKET_IFNAME=your_socket_ifname_here +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=your_rank_table_path_here +export VLLM_LOGGING_LEVEL="info" +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=10 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_DETERMINISTIC=True +export HCCL_BUFFSIZE=1024 +export TASK_QUEUE_ENABLE=1 + 
+export VLLM_USE_V1=1 + +export ASCEND_RT_VISIBLE_DEVICES=$1 + +vllm serve model_path \ + --host 0.0.0.0 \ + --port $2 \ + --data-parallel-size $3 \ + --data-parallel-rank $4 \ + --data-parallel-address $5 \ + --data-parallel-rpc-port $6 \ + --tensor-parallel-size $7 \ + --enable-expert-parallel \ + --seed 1024 \ + --served-model-name dsv3 \ + --max-model-len 3500 \ + --max-num-batched-tokens 3500 \ + --max-num-seqs 28 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --quantization ascend \ + --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \ + --kv-transfer-config \ + '{"kv_connector": "LLMDataDistCMgrConnector", + "kv_buffer_device": "npu", + "kv_role": "kv_consumer", + "kv_parallel_size": "1", + "kv_port": "20001", + "engine_id": "0", + "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector" + }' \ + --additional-config \ + '{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}' \ No newline at end of file diff --git a/examples/offline_data_parallel.py b/examples/offline_data_parallel.py new file mode 100644 index 0000000..c5d0b3e --- /dev/null +++ b/examples/offline_data_parallel.py @@ -0,0 +1,257 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py +# +""" +Usage: +Single node: + Dense models: + python examples/offline_data_parallel.py \ + --model="Qwen/Qwen2.5-0.5B-Instruct" \ + --dp-size=2 \ + --tp-size=2 + MOE models: + python examples/offline_data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --enable-expert-parallel + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --enable-expert-parallel \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_data_parallel.py \ + --model="ibm-research/PowerMoE-3b" \ + --dp-size=2 \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --enable-expert-parallel \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" + +import contextlib +import gc +import os +from time import sleep + +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import ( # noqa E402 + destroy_distributed_environment, destroy_model_parallel) +from vllm.utils import get_open_port + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser(description="Data Parallel Inference") + parser.add_argument( + "--model", + type=str, + default="ibm-research/PowerMoE-3b", + help="Model name or path", + ) + parser.add_argument("--dp-size", + type=int, + default=2, + help="Data parallel size") + parser.add_argument("--tp-size", + type=int, + default=1, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + 
parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + parser.add_argument("--enforce-eager", + action="store_true", + help="Enforce eager mode execution.") + parser.add_argument("--trust-remote-code", + action="store_true", + help="Trust remote code.") + parser.add_argument("--enable-expert-parallel", + action="store_true", + help="Enable expert parallel, used in MOE models.") + return parser.parse_args() + + +def cleanup_env_and_memory(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + +def main( + model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + GPUs_per_dp_rank, + enable_expert_parallel, + enforce_eager, + trust_remote_code, +): + # DP only support on V1 engine + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) + os.environ["VLLM_DP_SIZE"] = str(dp_size) + os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip + os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) + + # CUDA_VISIBLE_DEVICES for each DP rank is set automatically inside the + # engine processes. + + # Sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] * 100 + + # with DP, each rank should process different prompts. + # usually all the DP ranks process a full dataset, + # and each rank processes a different part of the dataset. + floor = len(prompts) // dp_size + remainder = len(prompts) % dp_size + + # Distribute prompts into even groups. 
+ def start(rank): + return rank * floor + min(rank, remainder) + + prompts = prompts[start(global_dp_rank):start(global_dp_rank + 1)] + if len(prompts) == 0: + # if any rank has no prompts to process, + # we need to set a placeholder prompt + prompts = ["Placeholder"] + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") + + # Create a sampling params object. + # since we are doing data parallel, every rank can have different + # sampling params. here we set different max_tokens for different + # ranks for demonstration. + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=[16, 20][global_dp_rank % 2]) + + # Create an LLM. + llm = LLM( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, + enable_expert_parallel=enable_expert_parallel, + trust_remote_code=trust_remote_code, + ) + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + + # Give engines time to pause their processing loops before exiting. 
+ sleep(5) + del llm + cleanup_env_and_memory() + +if __name__ == "__main__": + args = parse_args() + + dp_size = args.dp_size + tp_size = args.tp_size + node_size = args.node_size + node_rank = args.node_rank + + if node_size == 1: + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + else: + dp_master_ip = args.master_addr + dp_master_port = args.master_port + + assert dp_size % node_size == 0, "dp_size should be divisible by node_size" + dp_per_node = dp_size // node_size + + from multiprocessing import Process + + procs = [] + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): + proc = Process( + target=main, + args=( + args.model, + dp_size, + local_dp_rank, + global_dp_rank, + dp_master_ip, + dp_master_port, + tp_size, + args.enable_expert_parallel, + args.enforce_eager, + args.trust_remote_code, + ), + ) + proc.start() + procs.append(proc) + exit_code = 0 + for proc in procs: + proc.join(timeout=300) + if proc.exitcode is None: + print( + f"Killing process {proc.pid} that didn't stop within 5 minutes." + ) + proc.kill() + exit_code = 1 + elif proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/examples/offline_disaggregated_prefill_npu.py b/examples/offline_disaggregated_prefill_npu.py new file mode 100644 index 0000000..f37b508 --- /dev/null +++ b/examples/offline_disaggregated_prefill_npu.py @@ -0,0 +1,147 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import multiprocessing as mp +import os +import time +from multiprocessing import Event, Process + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +def clean_up(): + import gc + + import torch + from vllm.distributed.parallel_state import ( + destroy_distributed_environment, destroy_model_parallel) + destroy_model_parallel() + destroy_distributed_environment() + gc.collect() + torch.npu.empty_cache() + + +def run_prefill(prefill_done, process_close): + # ranktable.json needs be generated using gen_ranktable.sh + # from the examples/disaggregated_prefill_v1 in the main branch. + os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0" + + from vllm import LLM, SamplingParams + from vllm.config import KVTransferConfig + + prompts = [ + "Hello, how are you today?", "Hi, what is your name?", + "Tell me a very long story.", "what is your favourite book?" 
+ ] + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_producer", + kv_parallel_size=1, + kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector") + # Set NPU memory utilization to 0.8 + llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + tensor_parallel_size=1) + + llm.generate(prompts, sampling_params) + print("Prefill node is finished.") + prefill_done.set() + + # To keep the prefill node running in case the decode node is not done + # otherwise, the script might exit prematurely, causing incomplete decoding. + try: + while not process_close.is_set(): + time.sleep(1) + except KeyboardInterrupt: + print("Script stopped by user.") + finally: + print("Cleanup prefill resources") + del llm + clean_up() + + +def run_decode(prefill_done): + os.environ['VLLM_LLMDD_RPC_PORT'] = '6634' + # ranktable.json needs be generated using gen_ranktable.sh + # from the examples/disaggregated_prefill_v1 module in the main branch. + os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json" + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "1" + + from vllm import LLM, SamplingParams + from vllm.config import KVTransferConfig + + prompts = [ + "Hello, how are you today?", "Hi, what is your name?", + "Tell me a very long story.", "what is your favourite book?" 
+ ] + sampling_params = SamplingParams(temperature=0, top_p=0.95) + + ktc = KVTransferConfig(kv_connector="LLMDataDistCMgrConnector", kv_buffer_device="npu", kv_role="kv_consumer", + kv_parallel_size=1, kv_connector_module_path="vllm_ascend.distributed.llmdatadist_c_mgr_connector") + + llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8, + tensor_parallel_size=1) + + # Wait for the producer to start the consumer + print("Waiting for prefill node to finish...") + prefill_done.wait() + + # At this point when the prefill_done is set, the kv-cache should have been + # transferred to this decode node, so we can start decoding. + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + del llm + clean_up() + + +if __name__ == "__main__": + mp.get_context('spawn') + + prefill_done = Event() + process_close = Event() + prefill_process = Process(target=run_prefill, + args=( + prefill_done, + process_close, + )) + decode_process = Process(target=run_decode, args=(prefill_done, )) + + # Start prefill node + prefill_process.start() + + # Start decode node + decode_process.start() + + # Terminate the prefill node when decode is finished + decode_process.join() + + # Terminate prefill process + process_close.set() + prefill_process.join() + prefill_process.terminate() + print("All process done!") diff --git a/examples/offline_dualbatch_overlap_npu.py b/examples/offline_dualbatch_overlap_npu.py new file mode 100644 index 0000000..3829d6a --- /dev/null +++ b/examples/offline_dualbatch_overlap_npu.py @@ -0,0 +1,52 @@ +import os +import time + +from vllm import LLM, SamplingParams + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +# enable dual-batch overlap for vllm ascend 
+os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1" + +# Sample prompts. +prompts = ["The president of the United States is"] * 41 +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + + +def main(): + # Create an LLM. + llm = LLM(model="deepseek-ai/DeepSeek-V3-Lite-base-latest-w8a8-dynamic", + enforce_eager=True, + tensor_parallel_size=2, + max_model_len=4096, + trust_remote_code=True, + enable_expert_parallel=True, + additional_config={ + "torchair_graph_config": { + "enabled": False + }, + "ascend_scheduler_config": { + "enabled": True + }, + }) + + # Generate texts from the prompts. The output is a list of RequestOutput + # objects that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + print("-" * 50) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + # Add a buffer to wait for profiler in the background process + # (in case MP is on) to finish writing profiling output. + time.sleep(10) + + +if __name__ == "__main__": + main() diff --git a/examples/offline_embed.py b/examples/offline_embed.py new file mode 100644 index 0000000..7707e5f --- /dev/null +++ b/examples/offline_embed.py @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B +# + +import os + +import torch +from vllm import LLM + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +def get_detailed_instruct(task_description: str, query: str) -> str: + return f'Instruct: {task_description}\nQuery:{query}' + + +def main(): + # Each query must come with a one-sentence instruction that describes the task + task = 'Given a web search query, retrieve relevant passages that answer the query' + + queries = [ + get_detailed_instruct(task, 'What is the capital of China?'), + get_detailed_instruct(task, 'Explain gravity') + ] + # No need to add instruction for retrieval documents + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun." + ] + input_texts = queries + documents + + model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed") + + outputs = model.embed(input_texts) + embeddings = torch.tensor([o.outputs.embedding for o in outputs]) + # Calculate the similarity scores between the first two queries and the last two documents + scores = (embeddings[:2] @ embeddings[2:].T) + print(scores.tolist()) + # [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]] + + +if __name__ == "__main__": + main() diff --git a/examples/offline_external_launcher.py b/examples/offline_external_launcher.py new file mode 100644 index 0000000..4566fdc --- /dev/null +++ b/examples/offline_external_launcher.py @@ -0,0 +1,287 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py + +# Note: This script is designed to run with e2e test, +# please be careful to modify it. +""" +Usage: +Single node: + Dense models: + python examples/offline_external_launcher.py \ + --model="Qwen/Qwen2.5-0.5B-Instruct" \ + --tp-size=1 \ + --proc-per-node=2 + MOE models: + python examples/offline_external_launcher.py \ + --model="Qwen/Qwen3-30B-A3B" \ + --tp-size=2 \ + --proc-per-node=2 \ + --enable-expert-parallel + +Multi-node: + Node 0 (assume the node has ip of 10.99.48.128): + python examples/offline_external_launcher.py \ + --model="Qwen/Qwen3-30B-A3B" \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=0 \ + --proc-per-node=2 \ + --enable-expert-parallel \ + --master-addr=10.99.48.128 \ + --master-port=13345 + Node 1: + python examples/offline_external_launcher.py \ + --model="Qwen/Qwen3-30B-A3B" \ + --tp-size=2 \ + --node-size=2 \ + --node-rank=1 \ + --enable-expert-parallel \ + --master-addr=10.99.48.128 \ + --master-port=13345 +""" + +import argparse +import contextlib +import gc +import os +from multiprocessing import Process +from time import sleep + +import torch +from vllm import LLM, SamplingParams +from vllm.distributed.parallel_state import ( # noqa E402 + destroy_distributed_environment, destroy_model_parallel, get_tp_group) +from vllm.utils import get_open_port, GiB_bytes + 
+os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def parse_args(): + + parser = argparse.ArgumentParser(description="External launcher Inference") + parser.add_argument( + "--model", + type=str, + default="Qwen/Qwen3-0.6B", + help="Model name or path", + ) + parser.add_argument("--tp-size", + type=int, + default=1, + help="Tensor parallel size") + parser.add_argument("--node-size", + type=int, + default=1, + help="Total number of nodes") + parser.add_argument("--node-rank", + type=int, + default=0, + help="Rank of the current node") + parser.add_argument("--proc-per-node", + type=int, + default=1, + help="Number of processes per node") + parser.add_argument("--master-addr", + type=str, + default="", + help="Master node IP address") + parser.add_argument("--master-port", + type=int, + default=0, + help="Master node port") + parser.add_argument("--enforce-eager", + action="store_true", + help="Enforce eager mode execution.") + parser.add_argument("--trust-remote-code", + action="store_true", + help="Trust remote code.") + parser.add_argument("--enable-expert-parallel", + action="store_true", + help="Enable expert parallel, used in MOE models.") + parser.add_argument("--enable-sleep-mode", + action="store_true", + help="Enable sleep mode for the engine.") + parser.add_argument("--temperature", + type=float, + default=0.8, + help="Float that controls the randomness of the sampling.") + parser.add_argument("--model-weight-gib", + type=float, + default=None, + help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).") + + args = parser.parse_args() + if args.enable_sleep_mode: + if args.model_weight_gib is None or args.temperature != 0: + parser.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.") + if args.model_weight_gib <= 0: + parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.") + if args.model == 
parser.get_default("model") and args.model_weight_gib is None: + parser.error("model-weight-gib must be provided for default model when enable-sleep-mode is set.") + + return args + + +def main( + local_rank: int, + rank: int, + master_addr: str, + master_port: int, + model_weight_gib: float, + model: str = "Qwen/Qwen3-0.6B", + world_size: int = 4, + tensor_parallel_size: int = 2, + enable_expert_parallel: bool = False, + enforce_eager: bool = False, + trust_remote_code: bool = True, + enable_sleep_mode: bool = False, + temperature: float = 0.8, +): + os.environ["MASTER_ADDR"] = master_addr + os.environ["MASTER_PORT"] = str(master_port) + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(world_size) + if not torch.distributed.is_initialized(): + torch.distributed.init_process_group( + backend="cpu:gloo,npu:hccl", + world_size=world_size, + rank=rank, + ) + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] * 10 + sampling_params = SamplingParams( + temperature=temperature, + top_p=0.95, + max_tokens=10, + ) + llm = LLM( + model=model, + tensor_parallel_size=tensor_parallel_size, + enable_expert_parallel=enable_expert_parallel, + enforce_eager=enforce_eager, + trust_remote_code=trust_remote_code, + distributed_executor_backend="external_launcher", + seed=0, + enable_sleep_mode=enable_sleep_mode, + ) + tp_ranks = get_tp_group().ranks + print(f'TP RANKS: {tp_ranks}') + + outputs = llm.generate(prompts, sampling_params) + + if enable_sleep_mode: + if rank == 0: + free_bytes_before_sleep, total = torch.npu.mem_get_info() + llm.sleep(level=1) + if rank == 0: + free_bytes_after_sleep, total = torch.npu.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB") + # now the freed memory should be larger than the model weights + assert freed_bytes >= 
model_weight_gib / tensor_parallel_size * GiB_bytes + + llm.wake_up() + outputs_after_wakeup = llm.generate(prompts, sampling_params) + if rank == 0: + # cmp output + assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text + print("Sleep and wake up successfully!!") + + for i, output in enumerate(outputs): + if i >= 5: + # print only 5 outputs + break + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Global rank: {rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + + # Give engines time to pause their processing loops before exiting. + sleep(5) + del llm + cleanup_env_and_memory() + + +def cleanup_env_and_memory(): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +if __name__ == "__main__": + args = parse_args() + + tp_size = args.tp_size + node_size = args.node_size + proc_per_node = args.proc_per_node + node_rank = args.node_rank + + if node_size == 1: + master_addr = "127.0.0.1" + master_port = get_open_port() + else: + master_addr = args.master_addr + master_port = args.master_port + + world_size = node_size * proc_per_node + + procs = [] + for local_rank, rank in enumerate( + range(proc_per_node * node_rank, proc_per_node * (node_rank + 1))): + proc = Process(target=main, + args=( + local_rank, + rank, + master_addr, + master_port, + args.model_weight_gib, + args.model, + world_size, + tp_size, + args.enable_expert_parallel, + args.enforce_eager, + args.trust_remote_code, + args.enable_sleep_mode, + args.temperature, + )) + + proc.start() + procs.append(proc) + exit_code = 0 + for proc in procs: + proc.join(timeout=600) + if proc.exitcode is None: + print( + f"Killing process {proc.pid} that didn't stop within 30 minutes." 
+ ) + proc.kill() + exit_code = 1 + elif proc.exitcode: + exit_code = proc.exitcode + + exit(exit_code) diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py new file mode 100644 index 0000000..7cf36a9 --- /dev/null +++ b/examples/offline_inference_audio_language.py @@ -0,0 +1,105 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/audio_language.py +# +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on audio language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
+""" + +import os +import argparse + +from vllm.assets.audio import AudioAsset +try: + import librosa # type: ignore +except ImportError: + raise Exception("Can't import librosa, please ensure it's installed") + +from vllm import LLM, SamplingParams + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def prepare_inputs(audio_count: int, audio_path1: str, audio_path2: str): + use_vllm_audio_assert = True if audio_path1 == "mary_had_lamb" and audio_path2 == "winning_call" else False + if use_vllm_audio_assert: + audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] + else: + audio_assets = [librosa.load(audio_path1, sr=None), librosa.load(audio_path2, sr=None)] + + question_per_audio_count = { + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" + } + + audio_in_prompt = "".join([ + f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + for idx in range(audio_count) + ]) + question = question_per_audio_count[audio_count] + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + + mm_data = { + "audio": + audio_assets if not use_vllm_audio_assert else [asset.audio_and_sample_rate for asset in audio_assets[:audio_count]] + } + + # Merge text prompt and audio data into inputs + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + return inputs + + +def main(audio_count: int, audio_path1: str, audio_path2: str): + # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on + # lower-end GPUs. + # Unless specified, these settings have been tested to work on a single L4. + # `limit_mm_per_prompt`: the max num items for each modality per prompt. 
+ llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct", + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}, + enforce_eager=True) + + inputs = prepare_inputs(audio_count, audio_path1, audio_path2) + + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=None) + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print("generated_text:", generated_text) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Arguments of rank table generator", ) + parser.add_argument("--audio-path1", type=str, default="mary_had_lamb") + parser.add_argument("--audio-path2", type=str, default="winning_call") + args = parser.parse_args() + + audio_count = 2 + main(audio_count, args.audio_path1, args.audio_path2) diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py new file mode 100644 index 0000000..4630bd1 --- /dev/null +++ b/examples/offline_inference_npu.py @@ -0,0 +1,51 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+# isort: skip_file
+import os
+
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+from vllm import LLM, SamplingParams
+
+
+def main():
+    # The prompts to complete.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    # Greedy decoding, at most 100 new tokens per prompt.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+    # Build the engine.
+    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
+
+    # Run generation and echo each prompt with its completion.
+    for output in llm.generate(prompts, sampling_params):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference_npu_tp2.py b/examples/offline_inference_npu_tp2.py
new file mode 100644
index 0000000..05082b0
--- /dev/null
+++ b/examples/offline_inference_npu_tp2.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +# isort: skip_file +import os + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +from vllm import LLM, SamplingParams + + +def main(): + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + # Create an LLM. + llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite", + tensor_parallel_size=2, + enforce_eager=True, + trust_remote_code=True, + max_model_len=1024) + + # Generate texts from the prompts. + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + main() diff --git a/examples/offline_inference_sleep_mode_npu.py b/examples/offline_inference_sleep_mode_npu.py new file mode 100644 index 0000000..5ffcff6 --- /dev/null +++ b/examples/offline_inference_sleep_mode_npu.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +import os + +import torch +from vllm import LLM, SamplingParams +from vllm.utils import GiB_bytes + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +def main(): + prompt = "How are you?" + + free, total = torch.npu.mem_get_info() + print(f"Free memory before sleep: {free / 1024 ** 3:.2f} GiB") + # record npu memory use baseline in case other process is running + used_bytes_baseline = total - free + llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True) + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + llm.sleep(level=1) + + free_npu_bytes_after_sleep, total = torch.npu.mem_get_info() + print( + f"Free memory after sleep: {free_npu_bytes_after_sleep / 1024 ** 3:.2f} GiB" + ) + used_bytes = total - free_npu_bytes_after_sleep - used_bytes_baseline + # now the memory usage should be less than the model weights + # (0.5B model, 1GiB weights) + assert used_bytes < 1 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text + + +if __name__ == "__main__": + main() diff --git a/examples/prompt_embedding_inference.py b/examples/prompt_embedding_inference.py new file mode 100644 index 0000000..c953238 --- /dev/null +++ b/examples/prompt_embedding_inference.py @@ -0,0 +1,88 @@ +import os + +import torch +from transformers import (AutoModelForCausalLM, AutoTokenizer, + PreTrainedTokenizer) +from vllm import LLM + +os.environ["VLLM_USE_MODELSCOPE"] = "True" +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def init_tokenizer_and_llm(model_name: str): + tokenizer = AutoTokenizer.from_pretrained(model_name) + transformers_model = AutoModelForCausalLM.from_pretrained(model_name) + embedding_layer = transformers_model.get_input_embeddings() + llm = LLM(model=model_name, enable_prompt_embeds=True) + return tokenizer, embedding_layer, llm + + 
+def get_prompt_embeds(chat: list[dict[str, + str]], tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + token_ids = tokenizer.apply_chat_template(chat, + add_generation_prompt=True, + return_tensors='pt') + prompt_embeds = embedding_layer(token_ids).squeeze(0) + return prompt_embeds + + +def single_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + chat = [{ + "role": "user", + "content": "Please tell me about the capital of France." + }] + prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer) + + outputs = llm.generate({ + "prompt_embeds": prompt_embeds, + }) + + print("\n[Single Inference Output]") + print("-" * 30) + for o in outputs: + print(o.outputs[0].text) + print("-" * 30) + + +def batch_prompt_inference(llm: LLM, tokenizer: PreTrainedTokenizer, + embedding_layer: torch.nn.Module): + chats = [[{ + "role": "user", + "content": "Please tell me about the capital of France." + }], + [{ + "role": "user", + "content": "When is the day longest during the year?" + }], + [{ + "role": "user", + "content": "Where is bigger, the moon or the sun?" 
+ }]] + + prompt_embeds_list = [ + get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats + ] + + outputs = llm.generate([{ + "prompt_embeds": embeds + } for embeds in prompt_embeds_list]) + + print("\n[Batch Inference Outputs]") + print("-" * 30) + for i, o in enumerate(outputs): + print(f"Q{i+1}: {chats[i][0]['content']}") + print(f"A{i+1}: {o.outputs[0].text}\n") + print("-" * 30) + + +def main(): + model_name = "meta-llama/Llama-3.2-1B-Instruct" + tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name) + single_prompt_inference(llm, tokenizer, embedding_layer) + batch_prompt_inference(llm, tokenizer, embedding_layer) + + +if __name__ == "__main__": + main() diff --git a/examples/run_dp_server.sh b/examples/run_dp_server.sh new file mode 100644 index 0000000..1866fb0 --- /dev/null +++ b/examples/run_dp_server.sh @@ -0,0 +1,32 @@ + +export HCCL_IF_IP=2.0.0.0 +export GLOO_SOCKET_IFNAME="eth0" +export TP_SOCKET_IFNAME="eth0" +export HCCL_SOCKET_IFNAME="eth0" + +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=100 + +export VLLM_USE_V1=1 +export VLLM_USE_MODELSCOPE=true + +export ASCEND_LAUNCH_BLOCKING=0 + +vllm serve Qwen/Qwen1.5-MoE-A2.7B \ + --host 0.0.0.0 \ + --port 20002 \ + --served-model-name Qwen \ + --data-parallel-size 2 \ + --data-parallel-size-local 2 \ + --data-parallel-address 2.0.0.0 \ + --data-parallel-rpc-port 13389 \ + --tensor-parallel-size 4 \ + --enable-expert-parallel \ + --no-enable-prefix-caching \ + --max-num-seqs 16 \ + --max-model-len 4096 \ + --max-num-batched-tokens 4096 \ + --gpu-memory-utilization 0.9 \ + --trust-remote-code \ + --enforce-eager \ + --additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}' diff --git a/format.sh b/format.sh new file mode 100755 index 0000000..d083153 --- /dev/null +++ b/format.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +# +# Copyright (c) 2025 Huawei Technologies 
Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run:" + echo "# Install lint deps" + echo "pip install -r requirements-lint.txt" + echo "# (optional) Enable git commit pre check" + echo "pre-commit install" + echo "" + echo "See step by step contribution guide:" + echo "https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution" + exit 1 + fi +} + +check_command pre-commit + +# TODO: cleanup SC exclude +export SHELLCHECK_OPTS="--exclude=SC2046,SC2006,SC2086" +if [[ "$1" != 'ci' ]]; then + pre-commit run --all-files +else + pre-commit run --all-files --hook-stage manual +fi diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..6fe8e6c --- /dev/null +++ b/mypy.ini @@ -0,0 +1,16 @@ +[mypy] +; warn_return_any = True +warn_unused_configs = True + +; Suppress all missing import errors from torch_npu for mypy. 
+[mypy-torch_npu.*] +ignore_missing_imports = True + +[mypy-torchair.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +[mypy-lm_eval.*] +ignore_missing_imports = True \ No newline at end of file diff --git a/packages.txt b/packages.txt new file mode 100644 index 0000000..f23d3be --- /dev/null +++ b/packages.txt @@ -0,0 +1,5 @@ +git +vim +wget +jq +curl diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1a140ce --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,34 @@ +[build-system] +# Should be mirrored in requirements.txt +requires = [ + "cmake>=3.26", + "decorator", + "einops", + "numpy<2.0.0", + "packaging", + "pip", + "pybind11", + "pyyaml", + "scipy", + "setuptools>=64", + "setuptools-scm>=8", + "torch-npu==2.7.1.dev20250724", + "torch>=2.7.1", + "torchvision", + "wheel", + "msgpack", + "quart", + "numba", +] +build-backend = "setuptools.build_meta" + +[tool.pymarkdown] +plugins.md004.style = "sublist" # ul-style +plugins.md007.indent = 4 # ul-indent +plugins.md007.start_indented = true # ul-indent +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md046.enabled = false # code-block-style +plugins.md024.allow_different_nesting = true # no-duplicate-headers +plugins.md029.enabled = false # ol-prefix diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..9be7f39 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,20 @@ +-r requirements-lint.txt +-r requirements.txt +modelscope +openai +pytest >= 6.0 +pytest-asyncio +pytest-mock +lm-eval==0.4.8 +types-jsonschema +xgrammar +zmq +types-psutil +pytest-cov +regex +sentence_transformers +ray>=2.47.1 +protobuf>3.20.0 +librosa +soundfile +pytest_mock \ No newline at end of file diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 0000000..8a575e5 --- /dev/null +++ b/requirements-lint.txt @@ 
-0,0 +1,9 @@ +# formatting +pre-commit==4.0.1 + +# type checking +mypy==1.11.1 +types-PyYAML +types-regex +types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7808e85 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,27 @@ +# Should be mirrored in pyporject.toml +cmake>=3.26 +decorator +einops +numpy<2.0.0 +packaging +pip +pybind11 +pyyaml +scipy +setuptools>=64 +setuptools-scm>=8 +torch>=2.7.1 +torchvision +wheel + +# requirements for disaggregated prefill +msgpack +quart + +# Required for N-gram speculative decoding +numba + +# Install torch_npu +--pre +--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi +torch-npu==2.7.1.dev20250724 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..351efa1 --- /dev/null +++ b/setup.py @@ -0,0 +1,397 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/blob/main/setup.py +# + +import importlib.util +import logging +import os +import subprocess +import sys +from sysconfig import get_paths +from typing import Dict, List + +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext +from setuptools.command.build_py import build_py +from setuptools.command.develop import develop +from setuptools.command.install import install +from setuptools_scm import get_version + + +def load_module_from_path(module_name, path): + spec = importlib.util.spec_from_file_location(module_name, path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + +ROOT_DIR = os.path.dirname(__file__) +logger = logging.getLogger(__name__) + + +def check_or_set_default_env(cmake_args, + env_name, + env_variable, + default_path=""): + if env_variable is None: + logging.warning( + f"No {env_name} found in your environment, pleause try to set {env_name} " + "if you customize the installation path of this library, otherwise default " + "path will be adapted during build this project") + logging.warning(f"Set default {env_name}: {default_path}") + env_variable = default_path + else: + logging.info(f"Found existing {env_name}: {env_variable}") + # cann package seems will check this environments in cmake, need write this env variable back. 
+ if env_name == "ASCEND_HOME_PATH": + os.environ["ASCEND_HOME_PATH"] = env_variable + cmake_args += [f"-D{env_name}={env_variable}"] + return cmake_args + + +envs = load_module_from_path("envs", + os.path.join(ROOT_DIR, "vllm_ascend", "envs.py")) + + +class CMakeExtension(Extension): + + def __init__(self, + name: str, + cmake_lists_dir: str = ".", + **kwargs) -> None: + super().__init__(name, sources=[], py_limited_api=False, **kwargs) + self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) + + +class custom_build_info(build_py): + + def run(self): + soc_version = envs.SOC_VERSION + if not soc_version: + raise ValueError( + "SOC version is not set. Please set SOC_VERSION environment variable." + ) + if "310" in soc_version and not envs.COMPILE_CUSTOM_KERNELS: + raise ValueError( + "SOC version 310 only supports custom kernels. Please set COMPILE_CUSTOM_KERNELS=1 to enable custom kernels." + ) + + package_dir = os.path.join(ROOT_DIR, "vllm_ascend", "_build_info.py") + with open(package_dir, "w+") as f: + f.write('# Auto-generated file\n') + f.write(f"__soc_version__ = '{soc_version}'\n") + f.write( + f"__sleep_mode_enabled__ = {envs.COMPILE_CUSTOM_KERNELS}\n") + logging.info( + f"Generated _build_info.py with SOC version: {soc_version}") + super().run() + + +class cmake_build_ext(build_ext): + # A dict of extension directories that have been configured. + did_config: Dict[str, bool] = {} + + # + # Determine number of compilation jobs + # + def compute_num_jobs(self): + # `num_jobs` is either the value of the MAX_JOBS environment variable + # (if defined) or the number of CPUs available. + num_jobs = envs.MAX_JOBS + if num_jobs is not None: + num_jobs = int(num_jobs) + logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs) + else: + try: + # os.sched_getaffinity() isn't universally available, so fall + # back to os.cpu_count() if we get an error here. 
+ num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + num_jobs = max(1, num_jobs) + + return num_jobs + + # + # Perform cmake configuration for a single extension. + # + def configure(self, ext: CMakeExtension) -> None: + build_temp = self.build_temp + os.makedirs(build_temp, exist_ok=True) + source_dir = os.path.abspath(ROOT_DIR) + python_executable = sys.executable + cmake_args = ["cmake"] + # Default use release mode to compile the csrc code + # Turbo now support compiled with Release, Debug and RelWithDebugInfo + if envs.CMAKE_BUILD_TYPE is None or envs.CMAKE_BUILD_TYPE not in [ + "Debug", + "Release", + "RelWithDebugInfo", + ]: + envs.CMAKE_BUILD_TYPE = "Release" + cmake_args += [f"-DCMAKE_BUILD_TYPE={envs.CMAKE_BUILD_TYPE}"] + # Default dump the compile commands for lsp + cmake_args += ["-DCMAKE_EXPORT_COMPILE_COMMANDS=1"] + if envs.CXX_COMPILER is not None: + cmake_args += [f"-DCMAKE_CXX_COMPILER={envs.CXX_COMPILER}"] + if envs.C_COMPILER is not None: + cmake_args += [f"-DCMAKE_C_COMPILER={envs.C_COMPILER}"] + if envs.VERBOSE: + cmake_args += ["-DCMAKE_VERBOSE_MAKEFILE=ON"] + + # find ASCEND_HOME_PATH + check_or_set_default_env( + cmake_args, + "ASCEND_HOME_PATH", + envs.ASCEND_HOME_PATH, + "/usr/local/Ascend/ascend-toolkit/latest", + ) + + # find PYTHON_EXECUTABLE + check_or_set_default_env(cmake_args, "PYTHON_EXECUTABLE", + sys.executable) + + # find PYTHON_INCLUDE_PATH + check_or_set_default_env(cmake_args, "PYTHON_INCLUDE_PATH", + get_paths()["include"]) + + # ccache and ninja can not be applied at ascendc kernels now + + try: + # if pybind11 is installed via pip + pybind11_cmake_path = (subprocess.check_output( + [python_executable, "-m", "pybind11", + "--cmakedir"]).decode().strip()) + except subprocess.CalledProcessError as e: + # else specify pybind11 path installed from source code on CI container + raise RuntimeError(f"CMake configuration failed: {e}") + + install_path = os.path.join(ROOT_DIR, 
self.build_lib) + if isinstance(self.distribution.get_command_obj("develop"), develop): + install_path = os.path.join(ROOT_DIR, "vllm_ascend") + # add CMAKE_INSTALL_PATH + cmake_args += [f"-DCMAKE_INSTALL_PREFIX={install_path}"] + + cmake_args += [f"-DCMAKE_PREFIX_PATH={pybind11_cmake_path}"] + + cmake_args += [f"-DSOC_VERSION={envs.SOC_VERSION}"] + + # Override the base directory for FetchContent downloads to $ROOT/.deps + # This allows sharing dependencies between profiles, + # and plays more nicely with sccache. + # To override this, set the FETCHCONTENT_BASE_DIR environment variable. + fc_base_dir = os.path.join(ROOT_DIR, ".deps") + fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) + cmake_args += ["-DFETCHCONTENT_BASE_DIR={}".format(fc_base_dir)] + + torch_npu_command = "python3 -m pip show torch-npu | grep '^Location:' | awk '{print $2}'" + try: + torch_npu_path = subprocess.check_output( + torch_npu_command, shell=True).decode().strip() + torch_npu_path += "/torch_npu" + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Retrieve torch version version failed: {e}") + + # add TORCH_NPU_PATH + cmake_args += [f"-DTORCH_NPU_PATH={torch_npu_path}"] + + build_tool = [] + # TODO(ganyi): ninja and ccache support for ascend c auto codegen. now we can only use make build + # if which('ninja') is not None: + # build_tool += ['-G', 'Ninja'] + # Default build tool to whatever cmake picks. 
+ + cmake_args += [source_dir] + logging.info(f"cmake config command: {cmake_args}") + try: + subprocess.check_call(cmake_args, cwd=self.build_temp) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"CMake configuration failed: {e}") + + subprocess.check_call( + ["cmake", ext.cmake_lists_dir, *build_tool, *cmake_args], + cwd=self.build_temp, + ) + + def build_extensions(self) -> None: + if not envs.COMPILE_CUSTOM_KERNELS: + return + # Ensure that CMake is present and working + try: + subprocess.check_output(["cmake", "--version"]) + except OSError as e: + raise RuntimeError(f"Cannot find CMake executable: {e}") + + # Create build directory if it does not exist. + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + + targets = [] + + os.makedirs(os.path.join(self.build_lib, "vllm_ascend"), exist_ok=True) + + def target_name(s: str) -> str: + return s.removeprefix("vllm_ascend.") + + # Build all the extensions + for ext in self.extensions: + self.configure(ext) + targets.append(target_name(ext.name)) + + num_jobs = self.compute_num_jobs() + + build_args = [ + "--build", + ".", + f"-j={num_jobs}", + *[f"--target={name}" for name in targets], + ] + try: + subprocess.check_call(["cmake", *build_args], cwd=self.build_temp) + except OSError as e: + raise RuntimeError(f"Build library failed: {e}") + # Install the libraries + install_args = [ + "cmake", + "--install", + ".", + ] + try: + subprocess.check_call(install_args, cwd=self.build_temp) + except OSError as e: + raise RuntimeError(f"Install library failed: {e}") + + # copy back to build folder for editable build + if isinstance(self.distribution.get_command_obj("develop"), develop): + import shutil + for root, _, files in os.walk(self.build_temp): + for file in files: + if file.endswith(".so"): + src_path = os.path.join(root, file) + dst_path = os.path.join(self.build_lib, "vllm_ascend", + file) + shutil.copy(src_path, dst_path) + print(f"Copy: {src_path} -> {dst_path}") + + def 
run(self): + # First, run the standard build_ext command to compile the extensions + super().run() + + +class custom_install(install): + + def run(self): + self.run_command("build_ext") + install.run(self) + + +ROOT_DIR = os.path.dirname(__file__) +try: + VERSION = get_version(write_to="vllm_ascend/_version.py") +except LookupError: + # The checkout action in github action CI does not checkout the tag. It + # only checks out the commit. In this case, we set a dummy version. + VERSION = "0.0.0" + +ext_modules = [] +if envs.COMPILE_CUSTOM_KERNELS: + ext_modules = [CMakeExtension(name="vllm_ascend.vllm_ascend_C")] + + +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def read_readme() -> str: + """Read the README file if present.""" + p = get_path("README.md") + if os.path.isfile(p): + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() + else: + return "" + + +def get_requirements() -> List[str]: + """Get Python package dependencies from requirements.txt.""" + + def _read_requirements(filename: str) -> List[str]: + with open(get_path(filename)) as f: + requirements = f.read().strip().split("\n") + resolved_requirements = [] + for line in requirements: + if line.startswith("-r "): + resolved_requirements += _read_requirements(line.split()[1]) + elif line.startswith("--"): + continue + else: + resolved_requirements.append(line) + return resolved_requirements + + try: + requirements = _read_requirements("requirements.txt") + except ValueError: + print("Failed to read requirements.txt in vllm_ascend.") + return requirements + + +cmdclass = { + "build_py": custom_build_info, + "build_ext": cmake_build_ext, + "install": custom_install +} + +setup( + name="vllm_ascend", + # Follow: + # https://packaging.python.org/en/latest/specifications/version-specifiers + version=VERSION, + author="vLLM-Ascend team", + license="Apache 2.0", + description="vLLM Ascend backend plugin", + long_description=read_readme(), + 
long_description_content_type="text/markdown", + url="https://github.com/vllm-project/vllm-ascend", + project_urls={ + "Homepage": "https://github.com/vllm-project/vllm-ascend", + }, + # TODO: Add 3.12 back when torch-npu support 3.12 + classifiers=[ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + ], + packages=find_packages(exclude=("docs", "examples", "tests*", "csrc")), + python_requires=">=3.9", + install_requires=get_requirements(), + ext_modules=ext_modules, + cmdclass=cmdclass, + extras_require={}, + entry_points={ + "vllm.platform_plugins": ["ascend = vllm_ascend:register"], + "vllm.general_plugins": + ["ascend_enhanced_model = vllm_ascend:register_model"], + }, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/310p/test_offline_inference_310p.py b/tests/e2e/310p/test_offline_inference_310p.py new file mode 100644 index 0000000..31f7eb9 --- /dev/null +++ b/tests/e2e/310p/test_offline_inference_310p.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +import pytest +import vllm # noqa: F401 +from vllm import SamplingParams + +import vllm_ascend # noqa: F401 +from tests.e2e.conftest import VllmRunner + +MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen2.5-7B-Instruct"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float16"]) +@pytest.mark.parametrize("max_tokens", [5]) +def test_models(model: str, dtype: str, max_tokens: int) -> None: + example_prompts = [ + "Hello, my name is", + "The future of AI is", + ] + + with VllmRunner(model, + tensor_parallel_size=1, + dtype=dtype, + max_model_len=2048, + enforce_eager=True, + compilation_config={ + "custom_ops": + ["none", "+rms_norm", "+rotary_embedding"] + }) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +VL_MODELS = ["Qwen/Qwen2.5-VL-3B-Instruct"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float16"]) +def test_vl_model_with_samples(model: str, dtype: str) -> None: + example_prompts = [ + "Hello, my name is", + "The future of AI is", + ] + + with VllmRunner(model, + tensor_parallel_size=1, + dtype=dtype, + max_model_len=2048, + enforce_eager=True, + compilation_config={ + "custom_ops": + ["none", "+rms_norm", "+rotary_embedding"] + }) as vllm_model: + sampling_params = SamplingParams(max_tokens=100, + top_p=0.95, + top_k=50, + temperature=0.6) + vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/310p/test_offline_inference_parallel_310p.py b/tests/e2e/310p/test_offline_inference_parallel_310p.py new file mode 100644 index 0000000..6bf3356 --- /dev/null +++ b/tests/e2e/310p/test_offline_inference_parallel_310p.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +import pytest +import vllm # noqa: F401 + +import vllm_ascend # noqa: F401 +from tests.e2e.conftest import VllmRunner + +# Pangu local model path +MODELS = [ + "IntervitensInc/pangu-pro-moe-model", +] +# set additional config for ascend scheduler and torchair graph +ADDITIONAL_CONFIG = [{ + "additional_config": { + "torchair_graph_config": { + "enabled": True + }, + "ascend_scheduler_config": { + "enabled": True, + } + } +}] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float16"]) +@pytest.mark.parametrize("max_tokens", [5]) +@pytest.mark.parametrize("enfore_eager", [True, False]) +@pytest.mark.parametrize("additional_config", ADDITIONAL_CONFIG) +def test_pangu_model(model: str, dtype: str, max_tokens: int, + enfore_eager: bool, additional_config: dict) -> None: + if enfore_eager: + additional_config = {} + example_prompts = [ + "Hello, my name is", + "The future of AI is", + ] + + with VllmRunner(model, + tensor_parallel_size=4, + dtype=dtype, + max_model_len=1024, + enforce_eager=True, + enable_expert_parallel=True, + additional_config=additional_config, + distributed_executor_backend="mp") as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/common.sh b/tests/e2e/common.sh new file mode 
100644 index 0000000..3c61524 --- /dev/null +++ b/tests/e2e/common.sh @@ -0,0 +1,74 @@ +# bash fonts colors +cyan='\e[96m' +yellow='\e[33m' +red='\e[31m' +none='\e[0m' + +_cyan() { echo -e "${cyan}$*${none}"; } +_yellow() { echo -e "${yellow}$*${none}"; } +_red() { echo -e "${red}$*${none}"; } + +_info() { _cyan "Info: $*"; } +_warn() { _yellow "Warn: $*"; } +_err() { _red "Error: $*" && exit 1; } + +CURL_TIMEOUT=1 +CURL_COOLDOWN=5 +CURL_MAX_TRIES=180 + +function wait_url_ready() { + local serve_name="$1" + local url="$2" + i=0 + while true; do + _info "===> Waiting for ${serve_name} to be ready...${i}s" + i=$((i + CURL_COOLDOWN)) + set +e + curl --silent --max-time "$CURL_TIMEOUT" "${url}" >/dev/null + result=$? + set -e + if [ "$result" -eq 0 ]; then + break + fi + if [ "$i" -gt "$CURL_MAX_TRIES" ]; then + _info "===> ${CURL_MAX_TRIES}s exceeded waiting for ${serve_name} to be ready" + return 1 + fi + sleep "$CURL_COOLDOWN" + done + _info "===> ${serve_name} is ready." +} + +function wait_for_exit() { + local VLLM_PID="$1" + while kill -0 "$VLLM_PID"; do + _info "===> Wait for ${VLLM_PID} to exit." + sleep 1 + done + _info "===> Process ${VLLM_PID} has exited." 
+} + +VENV_PATH=/tmp/vllm_venv + +function clean_venv() { + if [[ -n "$VENV_PATH" && -d "$VENV_PATH" ]]; then + _info "Cleaning up default virtual env path: ${VENV_PATH}" + deactivate || true + rm -rf "$VENV_PATH" + fi +} + +function create_vllm_venv() { + # make a clean env path + clean_venv + _info "Creating vllm virtual environment at ${VENV_PATH}" + python3 -m venv ${VENV_PATH} + source ${VENV_PATH}/bin/activate +} + +function get_version() { + local VERSION_NAME="$1" + python3 "${SCRIPT_DIR}/../../docs/source/conf.py" | jq .${VERSION_NAME} | tr -d '"' +} + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py new file mode 100644 index 0000000..430153a --- /dev/null +++ b/tests/e2e/conftest.py @@ -0,0 +1,431 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# + +import contextlib +import gc +import os +from typing import Any, List, Optional, Tuple, TypeVar, Union + +import numpy as np +import pytest +import torch +from modelscope import snapshot_download # type: ignore[import-untyped] +from PIL import Image +from torch import nn +from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer, + BatchEncoding, BatchFeature) +from transformers.models.auto.auto_factory import _BaseAutoModelClass +from vllm import LLM, SamplingParams +from vllm.config import TaskOption, _get_and_verify_dtype +from vllm.inputs import TextPrompt +from vllm.outputs import RequestOutput +from vllm.transformers_utils.utils import maybe_model_redirect + +from tests.e2e.model_utils import (TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) +from vllm_ascend.ascend_config import clear_ascend_config +# TODO: remove this part after the patch merged into vllm, if +# we not explicitly patch here, some of them might be effectiveless +# in pytest scenario +from vllm_ascend.utils import adapt_patch # noqa E402 + +adapt_patch(True) +adapt_patch(False) + +from vllm.distributed.parallel_state import ( # noqa E402 + destroy_distributed_environment, destroy_model_parallel) + +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_M = TypeVar("_M") + +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] + +_TEST_DIR = os.path.dirname(__file__) + + +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() + gc.collect() + torch.npu.empty_cache() + 
torch.npu.reset_peak_memory_stats() + + +class VllmRunner: + + def __init__( + self, + model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", + # Use smaller max model length, otherwise bigger model cannot run due + # to kv cache size limit. + max_model_len: int = 1024, + dtype: str = "auto", + disable_log_stats: bool = True, + tensor_parallel_size: int = 1, + block_size: int = 16, + enable_chunked_prefill: bool = False, + swap_space: int = 4, + enforce_eager: Optional[bool] = False, + quantization: Optional[str] = None, + **kwargs, + ) -> None: + self.model = LLM( + model=model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=True, + dtype=dtype, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + quantization=quantization, + **kwargs, + ) + + def get_inputs( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[TextPrompt]: + if images is not None: + assert len(prompts) == len(images) + + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + inputs = [TextPrompt(prompt=prompt) for prompt in prompts] + if images is not None: + for i, image in enumerate(images): + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} + + if videos is not None: + for i, video in enumerate(videos): + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} + + if audios is not None: + for i, audio in enumerate(audios): + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} + + return inputs + + def generate( + self, + prompts: 
List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[List[int]], List[str]]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + outputs: List[Tuple[List[List[int]], List[str]]] = [] + for req_output in req_outputs: + prompt_str = req_output.prompt + prompt_ids = req_output.prompt_token_ids + req_sample_output_ids: List[List[int]] = [] + req_sample_output_strs: List[str] = [] + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + req_sample_output_ids.append(prompt_ids + output_ids) + req_sample_output_strs.append(prompt_str + output_str) + outputs.append((req_sample_output_ids, req_sample_output_strs)) + return outputs + + @staticmethod + def _final_steps_generate_w_logprobs( + req_outputs: List[RequestOutput], + ) -> List[TokensTextLogprobsPromptLogprobs]: + outputs: List[TokensTextLogprobsPromptLogprobs] = [] + for req_output in req_outputs: + assert len(req_output.outputs) > 0 + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs, + req_output.prompt_logprobs)) + return outputs + + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + toks_str_logsprobs_prompt_logprobs = ( + 
self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_greedy( + self, + prompts: List[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[int], str]]: + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + outputs = self.generate(prompts, + greedy_params, + images=images, + videos=videos, + audios=audios) + return [(output_ids[0], output_str[0]) + for output_ids, output_str in outputs] + + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=num_prompt_logprobs, + stop_token_ids=stop_token_ids, + stop=stop) + + return self.generate_w_logprobs(prompts, + greedy_logprobs_params, + images=images, + audios=audios, + videos=videos) + + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.embed(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] + + def __enter__(self): + return self + + def 
__exit__(self, exc_type, exc_value, traceback): + del self.model + clear_ascend_config() + cleanup_dist_env_and_memory() + + +class HfRunner: + + def get_default_device(self): + from vllm.platforms import current_platform + + return ("cpu" + if current_platform.is_cpu() else current_platform.device_type) + + def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + + if device is None: + device = self.device + + if isinstance(x, dict): + return {k: self.wrap_device(v, device) for k, v in x.items()} + + if hasattr(x, "device") and x.device.type == device: + return x + + return x.to(device) + + def __init__( + self, + model_name: str, + dtype: str = "auto", + *, + model_kwargs: Optional[dict[str, Any]] = None, + trust_remote_code: bool = True, + is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, + skip_tokenizer_init: bool = False, + auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM, + ) -> None: + model_name = maybe_model_redirect(model_name) + self.model_name = model_name + + self.config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=trust_remote_code, + ) + self.device = self.get_default_device() + self.dtype = torch_dtype = _get_and_verify_dtype( + self.model_name, + self.config, + dtype=dtype, + is_pooling_model=is_sentence_transformer or is_cross_encoder, + ) + + model_kwargs = model_kwargs if model_kwargs is not None else {} + model_kwargs.setdefault("torch_dtype", torch_dtype) + + if is_sentence_transformer: + # Lazy init required for AMD CI + from sentence_transformers import SentenceTransformer + + self.model = SentenceTransformer( + model_name, + device=self.device, + model_kwargs=model_kwargs, + trust_remote_code=trust_remote_code, + ) + elif is_cross_encoder: + # Lazy init required for AMD CI + from sentence_transformers import CrossEncoder + + self.model = CrossEncoder( + model_name, + device=self.device, + automodel_args=model_kwargs, + 
trust_remote_code=trust_remote_code, + ) + else: + model = auto_cls.from_pretrained( + model_name, + trust_remote_code=trust_remote_code, + **model_kwargs, + ) + + # in case some unquantized custom models are not in same dtype + if (getattr(model, "quantization_method", None) is None + and any(p.dtype != self.dtype + for p in model.parameters())): + model = model.to(dtype=self.dtype) + + if (getattr(model, "quantization_method", None) != "bitsandbytes" + and len({p.device + for p in model.parameters()}) < 2): + model = model.to(device=self.device) + + self.model = model + + if not skip_tokenizer_init: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + ) + + # don't put this import at the top level + # it will call torch.cuda.device_count() + from transformers import AutoProcessor # noqa: F401 + self.processor = AutoProcessor.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=trust_remote_code, + ) + if skip_tokenizer_init: + self.tokenizer = self.processor.tokenizer + + def encode(self, prompts: list[str], *args, + **kwargs) -> list[list[torch.Tensor]]: + return self.model.encode(prompts, *args, **kwargs) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + del self.model + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="session") +def ilama_lora_files(): + return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider") + + +def qwen_prompt(questions: List[str]) -> List[str]: + placeholder = "<|image_pad|>" + return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions] + + +PROMPT_TEMPLATES = { + "qwen2.5vl": qwen_prompt, +} + + +@pytest.fixture(params=list(PROMPT_TEMPLATES.keys())) +def prompt_template(request): + return PROMPT_TEMPLATES[request.param] diff 
--git a/tests/e2e/doctests/001-quickstart-test.sh b/tests/e2e/doctests/001-quickstart-test.sh new file mode 100755 index 0000000..6490908 --- /dev/null +++ b/tests/e2e/doctests/001-quickstart-test.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +function install_system_packages() { + if command -v apt-get >/dev/null; then + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + apt-get update -y && apt install -y curl + elif command -v yum >/dev/null; then + yum update -y && yum install -y curl + else + echo "Unknown package manager. Please install gcc, g++, numactl-devel, git, curl, and jq manually." 
+ fi +} + +function simple_test() { + # Do real import test + python3 -c "import vllm; print(vllm.__version__)" +} + +function quickstart_offline_test() { + # Do real script test + python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" +} + +function quickstart_online_test() { + install_system_packages + vllm serve Qwen/Qwen2.5-0.5B-Instruct & + wait_url_ready "vllm serve" "localhost:8000/v1/models" + # Do real curl test + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-0.5B-Instruct", + "prompt": "Beijing is a", + "max_tokens": 5, + "temperature": 0 + }' | python3 -m json.tool + VLLM_PID=$(pgrep -f "vllm serve") + _info "===> Try kill -2 ${VLLM_PID} to exit." + kill -2 "$VLLM_PID" + wait_for_exit "$VLLM_PID" +} + +_info "====> Start simple_test" +simple_test +_info "====> Start quickstart_offline_test" +quickstart_offline_test +_info "====> Start quickstart_online_test" +quickstart_online_test diff --git a/tests/e2e/doctests/002-pip-binary-installation-test.sh b/tests/e2e/doctests/002-pip-binary-installation-test.sh new file mode 100644 index 0000000..a763cef --- /dev/null +++ b/tests/e2e/doctests/002-pip-binary-installation-test.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +trap clean_venv EXIT + +function install_system_packages() { + if command -v apt-get >/dev/null; then + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + apt-get update -y && apt-get install -y gcc g++ cmake libnuma-dev wget git curl jq + elif command -v yum >/dev/null; then + yum update -y && yum install -y gcc g++ cmake numactl-devel wget git curl jq + else + echo "Unknown package manager. Please install curl manually." + fi +} + +function config_pip_mirror() { + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +} + +function install_binary_test() { + + install_system_packages + config_pip_mirror + create_vllm_venv + + PIP_VLLM_VERSION=$(get_version pip_vllm_version) + PIP_VLLM_ASCEND_VERSION=$(get_version pip_vllm_ascend_version) + _info "====> Install vllm==${PIP_VLLM_VERSION} and vllm-ascend ${PIP_VLLM_ASCEND_VERSION}" + + # Setup extra-index-url for x86 & torch_npu dev version + pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi" + + pip install vllm=="$(get_version pip_vllm_version)" + pip install vllm-ascend=="$(get_version pip_vllm_ascend_version)" + + pip list | grep vllm + + # Verify the installation + _info "====> Run offline example test" + pip install modelscope + python3 "${SCRIPT_DIR}/../../examples/offline_inference_npu.py" + +} + +_info "====> Start install_binary_test" +install_binary_test diff --git a/tests/e2e/model_utils.py b/tests/e2e/model_utils.py new file mode 100644 index 0000000..1a3ea5b --- /dev/null +++ b/tests/e2e/model_utils.py @@ -0,0 +1,74 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py +# + +from typing import Dict, List, Optional, Sequence, Tuple, Union + +from vllm.sequence import PromptLogprobs, SampleLogprobs + +TokensText = Tuple[List[int], str] + + +def check_outputs_equal( + *, + outputs_0_lst: Sequence[TokensText], + outputs_1_lst: Sequence[TokensText], + name_0: str, + name_1: str, +): + """ + Compare the two sequences generated by different models, + which should be equal. + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + output_ids_0, output_str_0 = outputs_0 + output_ids_1, output_str_1 = outputs_1 + + # The text and token outputs should exactly match + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + assert output_str_0 == output_str_1, fail_msg + assert output_ids_0 == output_ids_1, fail_msg + + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * List of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, + float]], + SampleLogprobs]]] + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * Optional list of top sample logprobs for each sampled token +# * Optional list of top prompt logprobs for each prompt token +# +# Allows prompt logprobs to be requested. 
+TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] diff --git a/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml new file mode 100644 index 0000000..7df0544 --- /dev/null +++ b/tests/e2e/models/configs/DeepSeek-V2-Lite.yaml @@ -0,0 +1,13 @@ +model_name: "deepseek-ai/DeepSeek-V2-Lite" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.375 + - name: "exact_match,flexible-extract" + value: 0.375 +tensor_parallel_size: 2 +apply_chat_template: False +fewshot_as_multiturn: False +trust_remote_code: True +enforce_eager: True diff --git a/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 0000000..eb7196a --- /dev/null +++ b/tests/e2e/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen2.5-VL-7B-Instruct" +model: "vllm-vlm" +tasks: +- name: "mmmu_val" + metrics: + - name: "acc,none" + value: 0.51 +max_model_len: 8192 \ No newline at end of file diff --git a/tests/e2e/models/configs/Qwen3-30B-A3B.yaml b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml new file mode 100644 index 0000000..be1bbb0 --- /dev/null +++ b/tests/e2e/models/configs/Qwen3-30B-A3B.yaml @@ -0,0 +1,18 @@ +model_name: "Qwen/Qwen3-30B-A3B" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.89 + - name: "exact_match,flexible-extract" + value: 0.85 +- name: "ceval-valid" + metrics: + - name: "acc,none" + value: 0.84 +num_fewshot: 5 +gpu_memory_utilization: 0.6 +enable_expert_parallel: True +tensor_parallel_size: 2 +apply_chat_template: False +fewshot_as_multiturn: False \ No newline at end of file diff --git a/tests/e2e/models/configs/Qwen3-8B-Base.yaml b/tests/e2e/models/configs/Qwen3-8B-Base.yaml new file mode 100644 index 0000000..e60cc9a --- /dev/null 
+++ b/tests/e2e/models/configs/Qwen3-8B-Base.yaml @@ -0,0 +1,13 @@ +model_name: "Qwen/Qwen3-8B-Base" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.82 + - name: "exact_match,flexible-extract" + value: 0.83 +- name: "ceval-valid" + metrics: + - name: "acc,none" + value: 0.82 +num_fewshot: 5 diff --git a/tests/e2e/models/configs/accuracy.txt b/tests/e2e/models/configs/accuracy.txt new file mode 100644 index 0000000..e29ff1a --- /dev/null +++ b/tests/e2e/models/configs/accuracy.txt @@ -0,0 +1,3 @@ +Qwen3-8B-Base.yaml +Qwen2.5-VL-7B-Instruct.yaml +Qwen3-30B-A3B.yaml \ No newline at end of file diff --git a/tests/e2e/models/conftest.py b/tests/e2e/models/conftest.py new file mode 100644 index 0000000..a75659f --- /dev/null +++ b/tests/e2e/models/conftest.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from pathlib import Path + +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--config-list-file", + action="store", + default=None, + help="Path to the file listing model config YAMLs (one per line)", + ) + parser.addoption( + "--tp-size", + action="store", + default="1", + help="Tensor parallel size to use for evaluation", + ) + parser.addoption( + "--config", + action="store", + default="./tests/e2e/models/configs/Qwen3-8B-Base.yaml", + help="Path to the model config YAML file", + ) + parser.addoption( + "--report-dir", + action="store", + default="./benchmarks/accuracy", + help="Directory to store report files", + ) + + +@pytest.fixture(scope="session") +def config_list_file(pytestconfig, config_dir): + rel_path = pytestconfig.getoption("--config-list-file") + return config_dir / rel_path + + +@pytest.fixture(scope="session") +def tp_size(pytestconfig): + return pytestconfig.getoption("--tp-size") + + +@pytest.fixture(scope="session") +def config(pytestconfig): + return pytestconfig.getoption("--config") + + 
+@pytest.fixture(scope="session") +def report_dir(pytestconfig): + return pytestconfig.getoption("report_dir") + + +def pytest_generate_tests(metafunc): + if "config_filename" in metafunc.fixturenames: + + if metafunc.config.getoption("--config-list-file"): + rel_path = metafunc.config.getoption("--config-list-file") + config_list_file = Path(rel_path).resolve() + config_dir = config_list_file.parent + with open(config_list_file, encoding="utf-8") as f: + configs = [ + config_dir / line.strip() for line in f + if line.strip() and not line.startswith("#") + ] + metafunc.parametrize("config_filename", configs) + else: + single_config = metafunc.config.getoption("--config") + config_path = Path(single_config).resolve() + metafunc.parametrize("config_filename", [config_path]) diff --git a/tests/e2e/models/report_template.md b/tests/e2e/models/report_template.md new file mode 100644 index 0000000..8402545 --- /dev/null +++ b/tests/e2e/models/report_template.md @@ -0,0 +1,21 @@ +# {{ model_name }} + +- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }})) +- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }} +- **Hardware Environment**: Atlas A2 Series +- **Parallel mode**: {{ parallel_mode }} +- **Execution mode**: ACLGraph + +**Command**: + +```bash +export MODEL_ARGS={{ model_args }} +lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \ +{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit 
is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}} +``` + +| Task | Metric | Value | Stderr | +|-----------------------|-------------|----------:|-------:| +{% for row in rows -%} +| {{ row.task }} | {{ row.metric }} | {{ row.value }} | ± {{ "%.4f" | format(row.stderr | float) }} | +{% endfor %} diff --git a/tests/e2e/models/test_lm_eval_correctness.py b/tests/e2e/models/test_lm_eval_correctness.py new file mode 100644 index 0000000..18768e1 --- /dev/null +++ b/tests/e2e/models/test_lm_eval_correctness.py @@ -0,0 +1,153 @@ +import os +from dataclasses import dataclass + +import lm_eval +import numpy as np +import pytest +import yaml +from jinja2 import Environment, FileSystemLoader + +RTOL = 0.03 +TEST_DIR = os.path.dirname(__file__) + + +@dataclass +class EnvConfig: + vllm_version: str + vllm_commit: str + vllm_ascend_version: str + vllm_ascend_commit: str + cann_version: str + torch_version: str + torch_npu_version: str + + +@pytest.fixture +def env_config() -> EnvConfig: + return EnvConfig(vllm_version=os.getenv('VLLM_VERSION', 'unknown'), + vllm_commit=os.getenv('VLLM_COMMIT', 'unknown'), + vllm_ascend_version=os.getenv('VLLM_ASCEND_VERSION', + 'unknown'), + vllm_ascend_commit=os.getenv('VLLM_ASCEND_COMMIT', + 'unknown'), + cann_version=os.getenv('CANN_VERSION', 'unknown'), + torch_version=os.getenv('TORCH_VERSION', 'unknown'), + torch_npu_version=os.getenv('TORCH_NPU_VERSION', + 'unknown')) + + +def build_model_args(eval_config, tp_size): + trust_remote_code = eval_config.get("trust_remote_code", False) + max_model_len = eval_config.get("max_model_len", 4096) + model_args = { + "pretrained": eval_config["model_name"], + "tensor_parallel_size": tp_size, + "dtype": "auto", + "trust_remote_code": trust_remote_code, + "max_model_len": max_model_len, + } + for s in [ + "max_images", "gpu_memory_utilization", "enable_expert_parallel", + "tensor_parallel_size", "enforce_eager" + ]: + val = eval_config.get(s, None) + if val 
is not None: + model_args[s] = val + + print("Model Parameters:") + print(model_args) + + return model_args + + +def generate_report(tp_size, eval_config, report_data, report_dir, env_config): + env = Environment(loader=FileSystemLoader(TEST_DIR)) + template = env.get_template("report_template.md") + model_args = build_model_args(eval_config, tp_size) + + parallel_mode = f"TP{model_args.get('tensor_parallel_size', 1)}" + if model_args.get('enable_expert_parallel', False): + parallel_mode += " + EP" + + report_content = template.render( + vllm_version=env_config.vllm_version, + vllm_commit=env_config.vllm_commit, + vllm_ascend_version=env_config.vllm_ascend_version, + vllm_ascend_commit=env_config.vllm_ascend_commit, + cann_version=env_config.cann_version, + torch_version=env_config.torch_version, + torch_npu_version=env_config.torch_npu_version, + model_name=eval_config["model_name"], + model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'", + model_type=eval_config.get("model", "vllm"), + datasets=",".join([task["name"] for task in eval_config["tasks"]]), + apply_chat_template=eval_config.get("apply_chat_template", True), + fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True), + limit=eval_config.get("limit", "N/A"), + batch_size="auto", + num_fewshot=eval_config.get("num_fewshot", "N/A"), + rows=report_data["rows"], + parallel_mode=parallel_mode) + + report_output = os.path.join( + report_dir, f"{os.path.basename(eval_config['model_name'])}.md") + os.makedirs(os.path.dirname(report_output), exist_ok=True) + with open(report_output, 'w', encoding='utf-8') as f: + f.write(report_content) + + +def test_lm_eval_correctness_param(config_filename, tp_size, report_dir, + env_config): + eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) + model_args = build_model_args(eval_config, tp_size) + success = True + report_data: dict[str, list[dict]] = {"rows": []} + + eval_params = { + "model": eval_config.get("model", 
"vllm"), + "model_args": model_args, + "tasks": [task["name"] for task in eval_config["tasks"]], + "apply_chat_template": eval_config.get("apply_chat_template", True), + "fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True), + "limit": eval_config.get("limit", None), + "batch_size": "auto", + } + for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]: + val = eval_config.get(s, None) + if val is not None: + eval_params[s] = val + + print("Eval Parameters:") + print(eval_params) + + results = lm_eval.simple_evaluate(**eval_params) + + for task in eval_config["tasks"]: + task_name = task["name"] + task_result = results["results"][task_name] + for metric in task["metrics"]: + metric_name = metric["name"] + ground_truth = metric["value"] + measured_value = round(task_result[metric_name], 4) + task_success = bool( + np.isclose(ground_truth, measured_value, rtol=RTOL)) + success = success and task_success + + print(f"{task_name} | {metric_name}: " + f"ground_truth={ground_truth} | measured={measured_value} | " + f"success={'✅' if task_success else '❌'}") + + report_data["rows"].append({ + "task": + task_name, + "metric": + metric_name, + "value": + f"✅{measured_value}" if success else f"❌{measured_value}", + "stderr": + task_result[ + metric_name.replace(',', '_stderr,') if metric_name == + "acc,none" else metric_name.replace(',', '_stderr,')] + }) + generate_report(tp_size, eval_config, report_data, report_dir, env_config) + assert success diff --git a/tests/e2e/multicard/test_data_parallel.py b/tests/e2e/multicard/test_data_parallel.py new file mode 100644 index 0000000..11b7681 --- /dev/null +++ b/tests/e2e/multicard/test_data_parallel.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Compare the outputs of vLLM with and without aclgraph. + +Run `pytest tests/multicard/test_data_parallel.py`. +""" + +import os +import subprocess +import sys +from unittest.mock import patch + +import pytest + +MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-30B-A3B"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [32]) +@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1"}) +def test_data_parallel_inference(model, max_tokens): + script = "examples/offline_data_parallel.py" + + env = os.environ.copy() + + cmd = [ + sys.executable, + script, + "--model", + model, + "--dp-size", + "2", + "--tp-size", + "1", + "--node-size", + "1", + "--node-rank", + "0", + "--trust-remote-code", + "--enforce-eager", + ] + if model == "Qwen/Qwen3-30B-A3B": + cmd.append("--enable-expert-parallel") + + print(f"Running subprocess: {' '.join(cmd)}") + proc = subprocess.run(cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=600) + output = proc.stdout.decode() + + print(output) + + assert "DP rank 0 needs to process" in output + assert "DP rank 1 needs to process" in output + assert "Generated text:" in output + assert proc.returncode == 0 diff --git a/tests/e2e/multicard/test_expert_parallel.py b/tests/e2e/multicard/test_expert_parallel.py new file mode 100644 index 0000000..e956ed6 --- /dev/null +++ b/tests/e2e/multicard/test_expert_parallel.py @@ -0,0 +1,32 @@ +import pytest + +from tests.e2e.conftest import VllmRunner +from tests.e2e.model_utils import check_outputs_equal + + 
+@pytest.mark.parametrize("model_name", ["deepseek-ai/DeepSeek-V2-Lite-Chat"]) +def test_e2e_ep_correctness(model_name): + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + max_tokens = 5 + + with VllmRunner(model_name, tensor_parallel_size=2, + enforce_eager=True) as vllm_model: + tp_output = vllm_model.generate_greedy(example_prompts, max_tokens) + + with VllmRunner(model_name, + tensor_parallel_size=2, + enable_expert_parallel=True, + enforce_eager=True) as vllm_model: + ep_output = vllm_model.generate_greedy(example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=ep_output, + outputs_1_lst=tp_output, + name_0="ep_output", + name_1="tp_output", + ) diff --git a/tests/e2e/multicard/test_external_launcher.py b/tests/e2e/multicard/test_external_launcher.py new file mode 100644 index 0000000..24c66bf --- /dev/null +++ b/tests/e2e/multicard/test_external_launcher.py @@ -0,0 +1,187 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Compare the outputs of vLLM with and without aclgraph. + +Run `pytest tests/multicard/test_external_launcher.py`. 
+""" + +import os +import subprocess +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest +import torch_npu + +MODELS = ["Qwen/Qwen3-0.6B"] +MOE_MODELS = ["Qwen/Qwen3-30B-A3B"] +DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10] + + +@pytest.mark.parametrize("model", MODELS) +def test_external_launcher(model): + script = Path( + __file__ + ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" + env = os.environ.copy() + # TODO: Change to 2 when ci machine has 4 cards + cmd = [ + sys.executable, + str(script), + "--model", + model, + "--tp-size", + "1", + "--node-size", + "1", + "--node-rank", + "0", + "--proc-per-node", + "2", + "--trust-remote-code", + ] + + print(f"Running subprocess: {' '.join(cmd)}") + proc = subprocess.run( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=600, + ) + output = proc.stdout.decode() + + print(output) + + assert "TP RANKS: [0]" in output + assert "TP RANKS: [1]" in output + assert "Generated text:" in output + assert proc.returncode == 0 + + +@pytest.mark.parametrize("model", MOE_MODELS) +def test_moe_external_launcher(model): + script = Path( + __file__ + ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" + env = os.environ.copy() + # TODO: Change to 2 when ci machine has 4 cards + cmd = [ + sys.executable, + str(script), "--model", model, "--tp-size", "2", "--node-size", "1", + "--node-rank", "0", "--proc-per-node", "2", "--trust-remote-code", + "--enable-expert-parallel" + ] + + print(f"Running subprocess: {' '.join(cmd)}") + proc = subprocess.run( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=600, + ) + output = proc.stdout.decode() + + print(output) + + assert "TP RANKS: [0, 1]" in output + assert "Generated text:" in output + assert proc.returncode == 0 + + +def test_external_launcher_and_sleepmode(): + script = Path( + __file__ + ).parent.parent.parent.parent / "examples" / 
"offline_external_launcher.py" + env = os.environ.copy() + # TODO: Change to 2 when ci machine has 4 cards + cmd = [ + sys.executable, + str(script), + "--model", + "Qwen/Qwen3-8B", + "--tp-size", + "1", + "--node-size", + "1", + "--node-rank", + "0", + "--proc-per-node", + "2", + "--trust-remote-code", + "--enable-sleep-mode", + "--temperature", + "0", + "--model-weight-gib", + "16", + ] + + print(f"Running subprocess: {' '.join(cmd)}") + proc = subprocess.run( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=300, + ) + output = proc.stdout.decode() + + print(output) + + assert "TP RANKS: [0]" in output + assert "TP RANKS: [1]" in output + assert "Generated text:" in output + assert "Sleep and wake up successfully!!" in output + assert proc.returncode == 0 + + +@pytest.mark.skipif( + DEVICE_NAME != "Ascend910B", + reason="This test is only for Ascend910B devices.", +) +@pytest.mark.parametrize("model", MODELS) +@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": "1"}) +def test_mm_allreduce(model): + script = Path( + __file__ + ).parent.parent.parent.parent / "examples" / "offline_external_launcher.py" + env = os.environ.copy() + cmd = [ + sys.executable, + str(script), + "--model", + model, + "--trust-remote-code", + ] + + print(f"Running subprocess: {' '.join(cmd)}") + proc = subprocess.run( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + timeout=600, + ) + + output = proc.stdout.decode() + print(output) + + assert "Generated text:" in output + assert proc.returncode == 0 diff --git a/tests/e2e/multicard/test_fused_moe_allgather_ep.py b/tests/e2e/multicard/test_fused_moe_allgather_ep.py new file mode 100644 index 0000000..9335e19 --- /dev/null +++ b/tests/e2e/multicard/test_fused_moe_allgather_ep.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +Execute the inference of fused_moe_allgather_ep and fused_moe_alltoall_ep. + +Run 'pytest tests/multicard/test_fused_moe_allgather_ep.py'. +""" + +import os +from unittest.mock import patch + +import pytest +from modelscope import snapshot_download # type: ignore +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + + +@pytest.mark.skipif( + True, + reason= + "Current disaggregated pd implementation may cause memory pulse, which will cause this test OOM, skip this test until the ringmla is ready " +) +@patch.dict( + os.environ, { + "VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "TASK_QUEUE_ENABLE": "1", + "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": "1" + }) +def test_generate_with_allgather(): + example_prompts = ["Hello, my name is"] + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + + with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), + tensor_parallel_size=2, + max_model_len=1024, + dtype="auto", + enable_expert_parallel=True, + additional_config={ + "ascend_scheduler_config": { + "enabled": True, + "chunked_prefill_enabled": False, + }, + }) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) + + +@pytest.mark.skipif( + True, + reason= + "Current disaggregated pd implementation may cause memory pulse, which will cause this test OOM, skip this test until the ringmla is ready " +) +@patch.dict(os.environ, { + 
"VLLM_WORKER_MULTIPROC_METHOD": "spawn", + "TASK_QUEUE_ENABLE": "1" +}) +def test_generate_with_alltoall(): + example_prompts = ["Hello, my name is"] + sampling_params = SamplingParams(max_tokens=100, temperature=0.0) + + with VllmRunner(snapshot_download("vllm-ascend/DeepSeek-V3-Pruning"), + tensor_parallel_size=2, + max_model_len=1024, + dtype="auto", + enable_expert_parallel=True, + additional_config={ + "ascend_scheduler_config": { + "enabled": True, + "chunked_prefill_enabled": False, + }, + }) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/multicard/test_ilama_lora_tp2.py b/tests/e2e/multicard/test_ilama_lora_tp2.py new file mode 100644 index 0000000..9fca8ae --- /dev/null +++ b/tests/e2e/multicard/test_ilama_lora_tp2.py @@ -0,0 +1,23 @@ +import pytest +from modelscope import snapshot_download # type: ignore + +from tests.e2e.conftest import VllmRunner +from tests.e2e.singlecard.test_ilama_lora import (EXPECTED_LORA_OUTPUT, + MODEL_PATH, do_sample) + + +@pytest.mark.parametrize("distributed_executor_backend", ["mp"]) +def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files): + with VllmRunner(snapshot_download(MODEL_PATH), + enable_lora=True, + max_loras=4, + dtype="half", + max_model_len=1024, + max_num_seqs=16, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2) + + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/e2e/multicard/test_offline_inference_distributed.py b/tests/e2e/multicard/test_offline_inference_distributed.py new file mode 100644 index 0000000..a90c864 --- /dev/null +++ b/tests/e2e/multicard/test_offline_inference_distributed.py @@ -0,0 +1,152 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/test_offline_inference.py`. +""" +import os +from unittest.mock import patch + +from modelscope import snapshot_download # type: ignore +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +def test_models_distributed_QwQ(): + example_prompts = [ + "Hello, my name is", + ] + dtype = "half" + max_tokens = 5 + with VllmRunner( + "Qwen/QwQ-32B", + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend="mp", + enforce_eager=True, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_DeepSeek_multistream_moe(): + example_prompts = [ + "Hello, my name is", + ] + dtype = "half" + max_tokens = 5 + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype=dtype, + tensor_parallel_size=2, + distributed_executor_backend="mp", + additional_config={ + "torchair_graph_config": { + "enabled": True, + "enable_multistream_moe": True, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + }, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_W8A8(): + example_prompts = 
[ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/Qwen3-8B-W8A8"), + max_model_len=8192, + dtype="auto", + tensor_parallel_size=2, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_W4A8DYNAMIC(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + + with VllmRunner( + snapshot_download("vllm-ascend/Qwen3-8B-W4A8"), + max_model_len=8192, + dtype="auto", + tensor_parallel_size=2, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"}) +def test_models_distributed_DeepSeek_W4A8DYNAMIC(): + prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"), + dtype="auto", + tensor_parallel_size=2, + quantization="ascend", + enforce_eager=True, + enable_expert_parallel=True, + additional_config={ + "torchair_graph_config": { + "enabled": False, + }, + "ascend_scheduler_config": { + "enabled": True, + } + }, + ) as vllm_model: + vllm_model.generate_greedy(prompts, max_tokens) + + +def test_sp_for_qwen3_moe() -> None: + example_prompts = [ + "Hello, my name is", + ] + sampling_params = SamplingParams(max_tokens=5, + temperature=0.0, + top_k=50, + top_p=0.9) + + with VllmRunner(snapshot_download("Qwen/Qwen3-30B-A3B"), + dtype="auto", + tensor_parallel_size=2, + distributed_executor_backend="mp", + compilation_config={ + "pass_config": { + "enable_sequence_parallelism": True + } + }, + enable_expert_parallel=True, + enforce_eager=True) as vllm_model: + vllm_model.generate(example_prompts, sampling_params) diff --git a/tests/e2e/multicard/test_pipeline_parallel.py b/tests/e2e/multicard/test_pipeline_parallel.py new file mode 100644 index 0000000..03774db --- /dev/null +++ b/tests/e2e/multicard/test_pipeline_parallel.py @@ -0,0 +1,46 @@ +# Copyright (c) 
2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +import pytest + +from tests.e2e.conftest import VllmRunner + +MODELS = [ + "Qwen/Qwen3-0.6B", +] + +TENSOR_PARALLELS = [1] +PIPELINE_PARALLELS = [2] +DIST_EXECUTOR_BACKEND = ["mp", "ray"] + +prompts = [ + "Hello, my name is", + "The future of AI is", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) +@pytest.mark.parametrize("pp_size", PIPELINE_PARALLELS) +@pytest.mark.parametrize("distributed_executor_backend", DIST_EXECUTOR_BACKEND) +def test_models(model: str, tp_size: int, pp_size: int, + distributed_executor_backend: str) -> None: + with VllmRunner(model, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + distributed_executor_backend=distributed_executor_backend, + gpu_memory_utilization=0.7) as vllm_model: + vllm_model.generate_greedy(prompts, 64) diff --git a/tests/e2e/multicard/test_prefix_caching.py b/tests/e2e/multicard/test_prefix_caching.py new file mode 100644 index 0000000..73d0d2c --- /dev/null +++ b/tests/e2e/multicard/test_prefix_caching.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Compare the with and without prefix caching on V1 scheduler or AscendScheduler.""" + +import pytest + +from tests.e2e.conftest 
import VllmRunner +from tests.e2e.model_utils import check_outputs_equal + +MODELS = [ + # for MHA + "Qwen/Qwen3-8B-Base", + # for MLA + "deepseek-ai/DeepSeek-V2-Lite-Chat" +] + +# A prompt containing a large markdown table. The table is randomly generated by GPT-4. +LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """ +| ID | Name | Age | Occupation | Country | Email | Phone Number | Address | +|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------| +| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL | +| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON | +| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK | +| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW | +| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ | +| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE | +| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY | +| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC | +| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK | +| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC| +| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ | +| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE | +| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, 
MA | +| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB | +| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK | +| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD | +| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ | +| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE | +| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA | +| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON | +| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK | +| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA | +| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ| +| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE | +| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO | +| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC | +| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK | +| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA | +| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ | +| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE | +""" + +INPUT_PROMPTS = [ + LONG_PROMPT + + "Question: what is the age of John Doe? Your answer: The age of John Doe is ", + LONG_PROMPT + + "Question: what is the age of Zack Blue? 
Your answer: The age of Zack Blue is " +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [50]) +def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None: + with VllmRunner(model, + enforce_eager=True, + max_model_len=2048, + tensor_parallel_size=2, + gpu_memory_utilization=0.7) as vllm_model: + prefix_cache_output = vllm_model.generate_greedy( + INPUT_PROMPTS, max_tokens) + + with VllmRunner(model, + enable_prefix_caching=False, + enforce_eager=True, + max_model_len=2048, + tensor_parallel_size=2, + gpu_memory_utilization=0.7) as vllm_model: + vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_output, + outputs_1_lst=prefix_cache_output, + name_0="vllm_output", + name_1="prefix_cache_output", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [50]) +def test_prefix_cache_with_ascend_scheduler(model: str, + max_tokens: int) -> None: + + with VllmRunner(model, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True, + }, + }, + enforce_eager=True, + max_model_len=2048, + tensor_parallel_size=2, + gpu_memory_utilization=0.7) as vllm_model: + vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens) + + with VllmRunner(model, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True, + 'enable_prefix_caching': True, + }, + }, + enforce_eager=True, + max_model_len=2048, + tensor_parallel_size=2, + gpu_memory_utilization=0.7) as vllm_model: + prefix_cache_output = vllm_model.generate_greedy( + INPUT_PROMPTS, max_tokens) + + with VllmRunner(model, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True, + 'enable_prefix_caching': True, + "enable_chunked_prefill": True, + }, + }, + enforce_eager=True, + max_model_len=2048, + tensor_parallel_size=2, + gpu_memory_utilization=0.7) as vllm_model: + chunk_prefill_prefix_cache_output = vllm_model.generate_greedy( + 
INPUT_PROMPTS, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_output, + outputs_1_lst=prefix_cache_output, + name_0="vllm_output", + name_1="prefix_cache_output", + ) + + check_outputs_equal( + outputs_0_lst=chunk_prefill_prefix_cache_output, + outputs_1_lst=prefix_cache_output, + name_0="chunk_prefill_prefix_cache_output", + name_1="prefix_cache_output", + ) diff --git a/tests/e2e/multicard/test_qwen3_moe.py b/tests/e2e/multicard/test_qwen3_moe.py new file mode 100644 index 0000000..13e1fa3 --- /dev/null +++ b/tests/e2e/multicard/test_qwen3_moe.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/e2e/multicard/test_qwen3_moe.py`. 
+""" + +import os + +from modelscope import snapshot_download # type: ignore + +from tests.e2e.conftest import VllmRunner + + +def test_models_distributed_Qwen3_MOE_TP2(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + tensor_parallel_size=2, + distributed_executor_backend="mp", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_MOE_TP2_WITH_EP(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + tensor_parallel_size=2, + enable_expert_parallel=True, + distributed_executor_backend="mp", + enforce_eager=False, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_MOE_W8A8(): + example_prompts = [ + "Hello, my name is", + ] + max_tokens = 5 + with VllmRunner( + snapshot_download("vllm-ascend/Qwen3-30B-A3B-W8A8"), + max_model_len=8192, + tensor_parallel_size=2, + quantization="ascend", + enforce_eager=True, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH_AIV(): + os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' + example_prompts = [ + "Hello, my name is", + ] + dtype = "auto" + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + dtype=dtype, + tensor_parallel_size=2, + enforce_eager=False, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) + + +def test_models_distributed_Qwen3_MOE_TP2_WITH_ACLGRAPH(): + if 'HCCL_OP_EXPANSION_MODE' in os.environ: + del os.environ['HCCL_OP_EXPANSION_MODE'] + example_prompts = [ + "Hello, my name is", + ] + dtype = "auto" + max_tokens = 5 + with VllmRunner( + "Qwen/Qwen3-30B-A3B", + dtype=dtype, + tensor_parallel_size=2, + enforce_eager=False, + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git 
a/tests/e2e/multicard/test_torchair_graph_mode.py b/tests/e2e/multicard/test_torchair_graph_mode.py new file mode 100644 index 0000000..1eb9d2f --- /dev/null +++ b/tests/e2e/multicard/test_torchair_graph_mode.py @@ -0,0 +1,224 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/multicard/test_torchair_graph_mode.py`. 
+""" +import os +from typing import Dict + +from tests.e2e.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +def _deepseek_torchair_test_fixture( + additional_config: Dict, + *, + tensor_parallel_size=2, + use_v1_schduler=False, +): + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + kwargs = {} + if not use_v1_schduler: + kwargs = { + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + } + additional_config.update(**kwargs) + + with VllmRunner( + "vllm-ascend/DeepSeek-V3-Pruning", + dtype="half", + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend="mp", + additional_config=additional_config, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, 5) + + # NOTE: vllm-ascend/DeepSeek-V3-Pruning is a random weight of + # DeepSeek-V3 with 2 hidden layers, thus the golden results seems + # inaccurate. This will only change if accuracy improves with the + # official weights of DeepSeek-V3. 
+ golden_results = [ + 'Hello, my name is下载早点向前很有่อง', + 'The president of the United States isSender)## physiological Albany', + 'The capital of France is Rocky转角 hospitalizedinterval sparked', + 'The future of AI is её asegο BIOS一扫', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") + + +def test_e2e_deepseekv3_with_torchair(): + additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + _deepseek_torchair_test_fixture(additional_config) + + +def test_e2e_deepseekv3_with_torchair_ms_mla(): + additional_config = { + "torchair_graph_config": { + "enabled": True, + "enable_multistream_mla": True, + }, + } + _deepseek_torchair_test_fixture(additional_config) + + +def test_e2e_deepseekv3_with_torchair_v1scheduler(): + additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + _deepseek_torchair_test_fixture(additional_config, use_v1_schduler=True) + + +def _pangu_torchair_test_fixture( + additional_config: Dict, + *, + tensor_parallel_size=2, +): + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + # torchair is only work without chunked-prefill now + kwargs = { + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + } + additional_config.update(**kwargs) + + with VllmRunner( + "vllm-ascend/pangu-pro-moe-pruing", + dtype="half", + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend="mp", + additional_config=additional_config, + enable_expert_parallel=True, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, 5) + + # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE + # with 2 hidden layers, thus the golden results seems inaccurate. 
+ # This will only change if accuracy changes with the official weights + # of PanguProMoE. + golden_results = [ + 'Hello, my name is Remempondeprecatedmiot忱', + 'The president of the United States is Remem下的一个 rever ceremoni Segnali', + 'The capital of France is Rememvoud administrativ Remem投', + 'The future of AI isotope Segnali Zoeken精细化 supus', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + assert golden_results[i] == vllm_output[i][1] + print(f"Generated text: {vllm_output[i][1]!r}") + + +def test_e2e_pangu_with_torchair(): + additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + _pangu_torchair_test_fixture(additional_config) + + +def _qwen_torchair_test_fixture( + model, + tp, + enable_expert_parallel, +): + # The current access control does not support 16 cards, + # so the MC2 operator in Qwen's graph mode cannot run. + # Once 16-card support is available, + # this e2e can be switched to graph mode. + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "ascend_scheduler_config": { + "enabled": True, + }, + "refresh": True, + } + + with VllmRunner( + model, + dtype="half", + tensor_parallel_size=tp, + distributed_executor_backend="mp", + enforce_eager=True, + additional_config=additional_config, + enable_expert_parallel=enable_expert_parallel, + ) as vllm_model: + # use greedy sampler to make sure the generated results are fix + vllm_output = vllm_model.generate_greedy(example_prompts, 5) + + # NOTE: vllm-ascend/pangu-pro-moe-pruing is only part of PanguProMoE + # with 2 hidden layers, thus the golden results seems inaccurate. + # This will only change if accuracy changes with the official weights + # of PanguProMoE. 
+ golden_results = [ + 'Hello, my name is Remempondeprecatedmiot忱', + 'The president of the United States is Remem下的一个 rever ceremoni Segnali', + 'The capital of France is Rememvoud administrativ Remem投', + 'The future of AI isotope Segnali Zoeken精细化 supus', + ] + + assert len(golden_results) == len(vllm_output) + for i in range(len(vllm_output)): + print(f"Generated text: {vllm_output[i][1]!r}") + + +def test_e2e_qwen2_with_torchair(): + _qwen_torchair_test_fixture("Qwen/Qwen2.5-0.5B-Instruct", 2, False) + + +def test_e2e_qwen3_moe_with_torchair(): + _qwen_torchair_test_fixture("Qwen/Qwen3-30B-A3B", 2, True) diff --git a/tests/e2e/pd_disaggreate/run_edge_case_test.sh b/tests/e2e/pd_disaggreate/run_edge_case_test.sh new file mode 100644 index 0000000..a086df0 --- /dev/null +++ b/tests/e2e/pd_disaggreate/run_edge_case_test.sh @@ -0,0 +1,141 @@ +#!/bin/bash +export LCCL_DETERMINISTIC=1 +export HCCL_DETERMINISTIC=true +export CLOSE_MATMUL_K_SHIFT=1 +export VLLM_USE_V1=1 + +set -xe + +# Models to run +MODELS=( + "Qwen/Qwen3-0.6B-Instruct" +) + +# Find the git repository root directory +GIT_ROOT=$(git rev-parse --show-toplevel) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Gen ranktable +RANKTABLE_PATH=${GIT_ROOT}/examples/disaggregate_prefill_v1/ranktable.json +if [ -f "$RANKTABLE_PATH" ]; then + rm "$RANKTABLE_PATH" +fi +cd ${GIT_ROOT}/examples/disaggregate_prefill_v1 +LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'` +bash gen_ranktable.sh --ips $LOCAL_HOST --network-card-name enp189s0f0 --prefill-device-cnt 1 --decode-device-cnt 1 +cd - +export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="$RANKTABLE_PATH" + +# Waits for vLLM to start. +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/health > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." 
+ pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == *"deepseek"* ]]; then + extra_args="--trust-remote-code" + fi + + echo "$extra_args" +} + + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Start prefill instance + PREFILL_PORT=8001 + + BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \ + --port $PREFILL_PORT \ + --seed 1024 \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Start decode instance + DECODE_PORT=8002 + + # Build the command with or without model-specific args + BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \ + --port $DECODE_PORT \ + --seed 1024 \ + --enforce-eager \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Wait 
for all instances to start
+  echo "Waiting for prefill instance on port $PREFILL_PORT to start..."
+  wait_for_server $PREFILL_PORT
+  echo "Waiting for decode instance on port $DECODE_PORT to start..."
+  wait_for_server $DECODE_PORT
+
+  # Build the command for the proxy server with all the hosts and ports
+  PROXY_PORT=8192
+  PROXY_CMD="python ${GIT_ROOT}/examples/disaggregate_prefill_v1/toy_proxy_server.py --port $PROXY_PORT"
+  PROXY_CMD+=" --prefiller-ports ${PREFILL_PORT}"
+  PROXY_CMD+=" --decoder-ports ${DECODE_PORT}"
+  # Start the proxy server
+  echo "Starting proxy server with command: $PROXY_CMD"
+  $PROXY_CMD &
+
+  # Wait for the proxy to start
+  sleep 5
+
+  # Run lm eval for this model
+  echo "Running tests for $model_name"
+  PREFILL_PORT=$PREFILL_PORT DECODE_PORT=$DECODE_PORT PROXY_PORT=$PROXY_PORT python -m pytest -s -v ${GIT_ROOT}/tests/e2e/pd_disaggreate/test_edge_cases.py
+
+  # Clean up before running next model
+  cleanup_instances
+  sleep 3
+}
+
+# Run tests for each model
+for model in "${MODELS[@]}"; do
+  run_tests_for_model "$model"
+done
+
+echo "All tests completed!"
\ No newline at end of file
diff --git a/tests/e2e/pd_disaggreate/setup_pd.sh b/tests/e2e/pd_disaggreate/setup_pd.sh
new file mode 100644
index 0000000..c15f109
--- /dev/null
+++ b/tests/e2e/pd_disaggreate/setup_pd.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project. +# + +function run_prefill_instance() { + local model_name=$1 + local tp_size=$2 + local prefill_port=$3 + local register_port=$4 + local prefill_device_ips=$5 + local decode_device_ips=$6 + + echo "================================" + echo "Testing model: $model_name" + echo "================================" + # Start prefill instance + + KV_CONFIG=$(jq -n \ + --arg kv_connector "AscendSimpleConnector" \ + --arg kv_buffer_device "npu" \ + --arg kv_role "kv_producer" \ + --argjson kv_parallel_size 8 \ + --arg kv_port 11001 \ + --argjson prefill_device_ips "$prefill_device_ips" \ + --argjson decode_device_ips "$decode_device_ips" \ + --argjson llmdatadist_comm_port 26000 \ + --arg proxy_ip "0.0.0.0" \ + --argjson proxy_port "$register_port" \ + --argjson http_port "$prefill_port" \ + '{ + "kv_connector": $kv_connector, + "kv_buffer_device": $kv_buffer_device, + "kv_role": $kv_role, + "kv_parallel_size": $kv_parallel_size, + "kv_port": $kv_port, + "kv_connector_extra_config": { + "prefill_device_ips": $prefill_device_ips, + "decode_device_ips": $decode_device_ips, + "llmdatadist_comm_port": $llmdatadist_comm_port, + "proxy_ip": $proxy_ip, + "proxy_port": $proxy_port, + "http_port": $http_port + } + }') + + # start prefill instance + ASCEND_RT_VISIBLE_DEVICES=0 vllm serve $model_name \ + --host 0.0.0.0 \ + --port $prefill_port \ + --tensor-parallel-size $tp_size \ + --served-model-name Deepseek \ + --max-model-len 2000 \ + --trust-remote-code \ + --enforce-eager \ + --kv-transfer-config "$KV_CONFIG" +} + + + +function run_decode_instance() { + # Start decode instance + local model_name=$1 + local tp_size=$2 + local decode_port=$3 + local register_port=$4 + local prefill_device_ips=$5 + local decode_device_ips=$6 + + KV_CONFIG=$(jq -n \ + --arg kv_connector "AscendSimpleConnector" \ + --arg kv_buffer_device "npu" \ + --arg kv_role "kv_consumer" \ + --argjson kv_parallel_size 8 \ + --arg kv_port 21001 \ + --argjson 
prefill_device_ips "$prefill_device_ips" \ + --argjson decode_device_ips "$decode_device_ips" \ + --argjson llmdatadist_comm_port 26000 \ + --arg proxy_ip "0.0.0.0" \ + --argjson proxy_port "$register_port" \ + --argjson http_port "$decode_port" \ + '{ + "kv_connector": $kv_connector, + "kv_buffer_device": $kv_buffer_device, + "kv_role": $kv_role, + "kv_parallel_size": $kv_parallel_size, + "kv_port": $kv_port, + "kv_connector_extra_config": { + "prefill_device_ips": $prefill_device_ips, + "decode_device_ips": $decode_device_ips, + "llmdatadist_comm_port": $llmdatadist_comm_port, + "proxy_ip": $proxy_ip, + "proxy_port": $proxy_port, + "http_port": $http_port + } + }') + + # start decode instance + ASCEND_RT_VISIBLE_DEVICES=1 vllm serve $model_name \ + --host 0.0.0.0 \ + --port $decode_port \ + --tensor-parallel-size $tp_size \ + --seed 1024 \ + --served-model-name Deepseek \ + --max-model-len 2000 \ + --max-num-batched-tokens 2000 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --enforce-eager \ + --kv-transfer-config "$KV_CONFIG" +} + +function run_proxy_server() { + # Build the command for the proxy server with all the hosts and ports + register_port=$1 + proxy_port=$2 + PROXY_CMD="python examples/disaggregated_prefill/p2p_disaggrefated_prefill_proxy.py --http-port $proxy_port --register-port $register_port" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & +} diff --git a/tests/e2e/pd_disaggreate/test_edge_cases.py b/tests/e2e/pd_disaggreate/test_edge_cases.py new file mode 100644 index 0000000..fe53ddc --- /dev/null +++ b/tests/e2e/pd_disaggreate/test_edge_cases.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# This code is from: https://github.com/vllm-project/vllm/blob/main/tests/v1/kv_connector/nixl_integration/test_edge_cases.py +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+import os + +import openai + +PREFILL_PORT = os.getenv("PREFILL_PORT", None) +DECODE_PORT = os.getenv("DECODE_PORT", None) +PROXY_PORT = os.getenv("PROXY_PORT", None) + +if PREFILL_PORT is None or DECODE_PORT is None or PROXY_PORT is None: + raise ValueError( + "Please set the PREFILL_PORT, DECODE_PORT, and PROXY_PORT.") + +LONG_PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result, when working on projects like vLLM we are able to meet many amazing people from various organizations like AMD, Google, NVIDIA, " # noqa: E501 +PROMPT = "Red Hat is the best company in the world to work for because it works on open source software, which means that all the contributions are delivered to the community. As a result," # noqa: E501 +SHORT_PROMPT = "Red Hat is " + + +def test_edge_cases(): + # Set the OpenAI API key and base URL + decode_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{DECODE_PORT}/v1", + ) + prefill_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{PREFILL_PORT}/v1", + ) + proxy_client = openai.OpenAI( + api_key="MY_KEY", + base_url=f"http://localhost:{PROXY_PORT}/v1", + ) + + # Get the list of models + models = decode_client.models.list() + MODEL = models.data[0].id + + # (1) Check that we can handle a very short prompt, + # less than the length of the block size. 
+ completion = proxy_client.completions.create(model=MODEL, + prompt=SHORT_PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + completion = prefill_client.completions.create(model=MODEL, + prompt=SHORT_PROMPT, + temperature=0) + prefill_response = completion.choices[0].text + print(f"SMALL PROMPT: {proxy_response=}") + print(f"SMALL PROMPT: {prefill_response=}") + assert proxy_response == prefill_response + + # (2) Check that we can handle a full prefix cache + # hit on the D worker but not on the P worker. + # (2a): prime the D worker. + completion = decode_client.completions.create(model=MODEL, + prompt=PROMPT, + temperature=0) + decode_response = completion.choices[0].text + # (2b): send via the P/D setup + completion = proxy_client.completions.create(model=MODEL, + prompt=PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + print(f"FULL CACHE HIT: {proxy_response=}") + assert proxy_response == decode_response + + # (3) Check that we can handle a partial prefix cache + # hit on the D worker. + completion = proxy_client.completions.create(model=MODEL, + prompt=LONG_PROMPT, + temperature=0) + proxy_response = completion.choices[0].text + completion = prefill_client.completions.create(model=MODEL, + prompt=LONG_PROMPT, + temperature=0) + prefill_response = completion.choices[0].text + print(f"PARTIAL CACHE HIT: {proxy_response=}") + assert proxy_response == prefill_response \ No newline at end of file diff --git a/tests/e2e/pd_disaggreate/test_pd_e2e.py b/tests/e2e/pd_disaggreate/test_pd_e2e.py new file mode 100644 index 0000000..5fd9232 --- /dev/null +++ b/tests/e2e/pd_disaggreate/test_pd_e2e.py @@ -0,0 +1,109 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import os +import signal +import subprocess +import time + +import psutil +import requests + + +def kill_process_and_children(pid): + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + print(f"Killing child process {child.pid}") + child.kill() + print(f"Killing parent process {pid}") + parent.kill() + except psutil.NoSuchProcess: + pass + + +def kill_all_vllm_related(): + current_pid = os.getpid() + + for proc in psutil.process_iter(['pid', 'cmdline']): + try: + if proc.pid == current_pid: + continue + cmd = ' '.join(proc.info['cmdline']) + if "vllm" in cmd or "proxy" in cmd or "engine_worker" in cmd: + kill_process_and_children(proc.pid) + except Exception: + continue + + +PROXY_PORT = 10102 +DECODE_PORT = 8002 + +SCRIPT_PATH = os.path.abspath("./tests/e2e/run_disagg_pd.sh") + + +def wait_for_port(port, timeout=30): + import socket + start = time.time() + while time.time() - start < timeout: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + if sock.connect_ex(("127.0.0.1", port)) == 0: + return True + time.sleep(1) + raise TimeoutError(f"Port {port} not ready after {timeout}s") + + +def start_and_test_pipeline(): + print("Launching bash script to run vLLM PD setup...") + proc = subprocess.Popen(["bash", SCRIPT_PATH]) + try: + print("Waiting for proxy port to be available...") + wait_for_port(PROXY_PORT, 180) + wait_for_port(DECODE_PORT, 600) + + # request + payload = { + "model": "Deepseek", + "prompt": "The future of AI is", 
+ "max_tokens": 64, + "temperature": 0, + } + response = requests.post( + f"http://localhost:{PROXY_PORT}/v1/completions", + headers={"Content-Type": "application/json"}, + json=payload, + timeout=10) + assert response.status_code == 200, f"HTTP failed: {response.status_code}" + result = response.json() + print("Response:", result) + assert "text" in result["choices"][0] + assert len(result["choices"][0]["text"].strip()) > 0 + + finally: + # clean up subprocesses + print("Cleaning up subprocess...") + proc.send_signal(signal.SIGINT) + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + kill_all_vllm_related() + + +def test_disaggregated_pd_pipeline(): + start_and_test_pipeline() diff --git a/tests/e2e/prompts/example.txt b/tests/e2e/prompts/example.txt new file mode 100644 index 0000000..e1b97bc --- /dev/null +++ b/tests/e2e/prompts/example.txt @@ -0,0 +1,8 @@ +vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. +Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020. +Compare and contrast artificial intelligence with human intelligence in terms of processing information. +Describe the basic components of a neural network and how it can be trained. +Write a short story about a robot that dreams for the first time. +Analyze the impact of the COVID-19 pandemic on global economic structures and future business models. +Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies. +Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.' diff --git a/tests/e2e/run_disagg_pd.sh b/tests/e2e/run_disagg_pd.sh new file mode 100644 index 0000000..99d0faa --- /dev/null +++ b/tests/e2e/run_disagg_pd.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+set -eo errexit
+
+. $(dirname "$0")/common.sh
+. $(dirname "$0")/pd_disaggreate/setup_pd.sh
+
+export VLLM_USE_MODELSCOPE="True"
+
+MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
+# TODO: add tp case
+TP_SIZE=1
+
+# TODO: support multi-card
+prefill_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 0 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs)
+PREFILL_DEVICE_IPS="[\"$prefill_ip\"]"
+
+decode_ip=$(/usr/local/Ascend/driver/tools/hccn_tool -i 1 -ip -g | grep "ipaddr" | awk -F: '{print $2}' | xargs)
+DECODE_DEVICE_IPS="[\"$decode_ip\"]"
+
+_info "====> Start pd disaggregated test"
+REGISTER_PORT=10101
+PROXY_PORT=10102
+run_proxy_server $REGISTER_PORT $PROXY_PORT
+_info "Started pd disaggregated proxy server"
+
+PREFILL_PROC_NAME="Prefill-instance"
+PREFILL_PORT=8001
+_info "Starting prefill instance"
+run_prefill_instance $MODEL_NAME $TP_SIZE $PREFILL_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS &
+_info "Waiting for prefill instance ready"
+wait_url_ready $PREFILL_PROC_NAME "http://localhost:${PREFILL_PORT}/v1/completions"
+
+DECODE_PROC_NAME="Decode-instance"
+DECODE_PORT=8002
+_info "Starting decode instance"
+run_decode_instance $MODEL_NAME $TP_SIZE $DECODE_PORT $REGISTER_PORT $PREFILL_DEVICE_IPS $DECODE_DEVICE_IPS &
+_info "Waiting for decode instance ready"
+wait_url_ready $DECODE_PROC_NAME "http://localhost:${DECODE_PORT}/v1/completions"
+
+_info "pd disaggregated system is ready for handling request" diff --git a/tests/e2e/run_doctests.sh b/tests/e2e/run_doctests.sh new file mode 100755 index 0000000..2b00b64 --- /dev/null +++ b/tests/e2e/run_doctests.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +set -eo errexit + +. $(dirname "$0")/common.sh + +export VLLM_USE_MODELSCOPE=true +export VLLM_LOGGING_LEVEL=ERROR + +_info "====> Start Quickstart test" +. "${SCRIPT_DIR}/doctests/001-quickstart-test.sh" + +_info "====> Start pip binary installation test" +. "${SCRIPT_DIR}/doctests/002-pip-binary-installation-test.sh" + +_info "Doctest passed." 
diff --git a/tests/e2e/singlecard/__init__.py b/tests/e2e/singlecard/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/singlecard/ops/__init__.py b/tests/e2e/singlecard/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/e2e/singlecard/ops/test_bgmv_expand.py b/tests/e2e/singlecard/ops/test_bgmv_expand.py new file mode 100644 index 0000000..0aca9ca --- /dev/null +++ b/tests/e2e/singlecard/ops/test_bgmv_expand.py @@ -0,0 +1,46 @@ +import gc + +import torch + +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() + +DEFAULT_ATOL = 1e-3 +DEFAULT_RTOL = 1e-3 + + +def bgmv_expand_cpu_impl(x: torch.Tensor, w: torch.Tensor, + indices: torch.Tensor, y: torch.tensor, + slice_offset: int, slice_size: int) -> torch.Tensor: + W = w[indices, :, :].transpose(-1, -2).to(torch.float32) + z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() + y[:, slice_offset:slice_offset + slice_size] += z + return y + + +@torch.inference_mode() +def test_bgmv_expand(): + B = 1 + x = torch.randn([B, 16], dtype=torch.float) + w = torch.randn([64, 128, 16], dtype=torch.float16) + indices = torch.zeros([B], dtype=torch.int64) + y = torch.randn([B, 128 * 3], dtype=torch.float16) + + x_npu = x.npu() + w_npu = w.npu() + indices_npu = indices.npu() + y_npu = y.npu() + + y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128) + y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0, + 128) + + # Compare the results. 
+ torch.testing.assert_close(y_out_npu.cpu(), + y_out, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/ops/test_bgmv_shrink.py b/tests/e2e/singlecard/ops/test_bgmv_shrink.py new file mode 100644 index 0000000..99bb8e8 --- /dev/null +++ b/tests/e2e/singlecard/ops/test_bgmv_shrink.py @@ -0,0 +1,45 @@ +import gc + +import torch + +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() + +DEFAULT_ATOL = 1e-3 +DEFAULT_RTOL = 1e-3 + + +def bgmv_shrink_cpu_impl(x: torch.Tensor, w: torch.Tensor, + indices: torch.Tensor, y: torch.tensor, + scaling: float) -> torch.Tensor: + W = w[indices, :, :].transpose(-1, -2).to(torch.float32) + z = torch.bmm(x.unsqueeze(1).to(torch.float32), W).squeeze() + y[:, :] += z * scaling + return y + + +@torch.inference_mode() +def test_bgmv_shrink(): + B = 1 + x = torch.randn([B, 128], dtype=torch.float16) + w = torch.randn([64, 16, 128], dtype=torch.float16) + indices = torch.zeros([B], dtype=torch.int64) + y = torch.zeros([B, 16]) + + x_npu = x.npu() + w_npu = w.npu() + indices_npu = indices.npu() + y_npu = y.npu() + + y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5) + torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5) + + # Compare the results. + torch.testing.assert_close(y_npu.cpu(), + y, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/ops/test_fused_moe.py b/tests/e2e/singlecard/ops/test_fused_moe.py new file mode 100644 index 0000000..cf13010 --- /dev/null +++ b/tests/e2e/singlecard/ops/test_fused_moe.py @@ -0,0 +1,284 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/kernels/test_moe.py +"""Tests for the MOE layers. + +Run `pytest tests/ops/test_fused_moe.py`. +""" + +import gc +from unittest.mock import MagicMock, patch + +import pytest +import torch +import torch_npu +from vllm.model_executor.layers.activation import SiluAndMul + +from vllm_ascend.ops.layers.experts_selector import select_experts +from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \ + TokenDispatcherWithAllGather + +NUM_EXPERTS = [8, 64] +EP_SIZE = [1, 4] +TOP_KS = [2, 6] +DEVICE = ["npu"] + + +def apply_mlp( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + group_list: torch.Tensor, + group_list_type: int = 1, +) -> torch.Tensor: + w1 = w1.transpose(1, 2) + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + )[0] + + hidden_states = torch_npu.npu_swiglu(hidden_states) + + w2 = w2.transpose(1, 2) + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + )[0] + + return hidden_states + + +def torch_moe(a, w1, w2, topk_weights, topk_ids, topk, expert_map): + B, D = a.shape + a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D) + out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device) + topk_weights = topk_weights.view(-1) + topk_ids = 
topk_ids.view(-1) + if expert_map is not None: + topk_ids = expert_map[topk_ids] + for i in range(w1.shape[0]): + mask = topk_ids == i + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + return (out.view(B, -1, w2.shape[1]) * + topk_weights.view(B, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("ep_size", EP_SIZE) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("device", DEVICE) +def test_token_dispatcher_with_all_gather( + m: int, + n: int, + k: int, + e: int, + topk: int, + ep_size: int, + dtype: torch.dtype, + device: str, +): + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10 + + score = torch.randn((m, e), device=device, dtype=dtype) + expert_map = None + local_e = e + w1_local = w1 + w2_local = w2 + + if ep_size > 1: + local_e = e // ep_size + e_ids = torch.arange(local_e * 0, + local_e * (0 + 1), + device=device, + dtype=torch.int32) + expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32) + expert_map[e_ids] = torch.arange(local_e, + device=device, + dtype=torch.int32) + w1_local = w1[e_ids] + w2_local = w2[e_ids] + + score = torch.softmax(score, dim=-1, dtype=dtype) + topk_weights, topk_ids = torch.topk(score, topk) + topk_ids = topk_ids.to(torch.int32) + row_idx = (torch.arange( + 0, + m * topk, + device=device, + dtype=torch.int32, + ).view(topk, -1).permute(1, 0).contiguous()) + + dispatcher_kwargs = { + "num_experts": e, + "top_k": topk, + "num_local_experts": local_e, + } + dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs) + + 
apply_router_weight_on_input = False + dispatch_output = dispatcher.token_dispatch( + hidden_states=a, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=row_idx, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input) + + sorted_hidden_states = dispatch_output["hidden_states"] + group_list = dispatch_output["group_list"] + group_list_type = dispatch_output.get("group_list_type", 1) + + expert_output = apply_mlp(hidden_states=sorted_hidden_states, + w1=w1_local, + w2=w2_local, + group_list=group_list, + group_list_type=group_list_type) + + combined_output = dispatcher.token_combine(hidden_states=expert_output, + bias=None) + + torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, + expert_map) + + torch.testing.assert_close(combined_output, + torch_output, + atol=4e-2, + rtol=1) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +@pytest.mark.parametrize("m", [1, 33, 64]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("scoring_func", ["softmax", "sigmoid"]) +@pytest.mark.parametrize("use_grouped_topk", [True, False]) +@pytest.mark.parametrize("renormalize", [True, False]) +@pytest.mark.parametrize("with_e_correction", [True, False]) +@pytest.mark.parametrize("custom_routing", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("device", DEVICE) +def test_select_experts( + m: int, + n: int, + e: int, + topk: int, + scoring_func: str, + use_grouped_topk: bool, + renormalize: bool, + with_e_correction: bool, + custom_routing: bool, + dtype: torch.dtype, + device: str, +): + topk_group = 4 if use_grouped_topk else None + num_expert_group = e // 4 if use_grouped_topk else None + + hidden_states = torch.randn(m, n, device=device, dtype=dtype) + router_logits = torch.randn(m, e, device=device, dtype=dtype) + + 
e_score_correction_bias = (torch.randn(e, device=device, dtype=dtype) + if with_e_correction else None) + + custom_routing_function = None + if custom_routing: + custom_routing_function = MagicMock() + mock_weights = torch.randn(m, topk, device=device, dtype=dtype) + mock_ids = torch.randint(0, + e, (m, topk), + device=device, + dtype=torch.int32) + custom_routing_function.return_value = (mock_weights, mock_ids) + + with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk" + ) as mock_native_grouped_topk: + mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like( + x) + + topk_weights, topk_ids, row_idx = select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=topk, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + if use_grouped_topk: + mock_native_grouped_topk.assert_called_once() + else: + mock_native_grouped_topk.assert_not_called() + + assert topk_weights.shape == (m, topk) + assert topk_ids.shape == (m, topk) + assert topk_ids.dtype == torch.int32 + assert row_idx.shape == (m, topk) + + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +@pytest.mark.parametrize("device", DEVICE) +def test_select_experts_invalid_scoring_func(device: str): + with pytest.raises(ValueError, + match="Unsupported scoring function: invalid"): + select_experts(hidden_states=torch.randn(1, 128, device=device), + router_logits=torch.randn(1, 8, device=device), + top_k=2, + use_grouped_topk=False, + renormalize=False, + scoring_func="invalid") + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +@pytest.mark.parametrize("device", DEVICE) +def test_select_experts_missing_group_params(device: str): + with pytest.raises(AssertionError): + 
select_experts(hidden_states=torch.randn(1, 128, device=device), + router_logits=torch.randn(1, 64, device=device), + top_k=2, + use_grouped_topk=True, + renormalize=False, + scoring_func="softmax") + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/ops/test_gating_top_k_softmax.py b/tests/e2e/singlecard/ops/test_gating_top_k_softmax.py new file mode 100644 index 0000000..4edcdfd --- /dev/null +++ b/tests/e2e/singlecard/ops/test_gating_top_k_softmax.py @@ -0,0 +1,37 @@ +import pytest +import torch +import torch_npu + + +@pytest.mark.parametrize( + 'B', + [1, 16, 64, 128, 32768], +) +@pytest.mark.parametrize( + 'D', + [8, 16, 32, 64, 128], +) +@pytest.mark.parametrize( + 'top_k', + [1, 2, 4, 8], +) +@pytest.mark.parametrize( + "dtype, atol, rtol", + [ + (torch.float16, 1e-3, 1e-3), + (torch.bfloat16, 1e-3, 1e-3), + ], +) +def test_quant_fpx_linear(B: int, D: int, top_k: int, dtype, atol, rtol): + x = torch.rand((B, D), dtype=dtype).to("npu") + # finished = torch.randint(1, size=(B,), dtype=torch.bool).to("npu") + finished = None + y, expert_idx, row_idx = torch_npu.npu_moe_gating_top_k_softmax(x, + finished, + k=top_k) + + topk_weights = x.softmax(dim=-1) + topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1) + topk_ids = topk_ids.to(torch.int32) + torch.allclose(y, topk_weights, atol=atol, rtol=rtol) + torch.allclose(expert_idx, topk_ids, atol=atol, rtol=rtol) diff --git a/tests/e2e/singlecard/ops/test_moe_comm.py b/tests/e2e/singlecard/ops/test_moe_comm.py new file mode 100644 index 0000000..b034ed4 --- /dev/null +++ b/tests/e2e/singlecard/ops/test_moe_comm.py @@ -0,0 +1,175 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +import gc +from types import SimpleNamespace + +import pytest +import torch + +from vllm.model_executor.layers.fused_moe.config import ( # isort: skip + FusedMoEConfig, FusedMoEParallelConfig) + +from vllm_ascend.distributed.moe_comm_method import ( # isort: skip + AllGatherCommImpl, NativeAllGatherCommImpl) + + +@pytest.mark.parametrize("num_tokens", [16, 128]) +@pytest.mark.parametrize("hidden_size", [64, 128]) +@pytest.mark.parametrize("global_num_experts", [8, 16]) +@pytest.mark.parametrize("num_local_experts", [4, 8]) +@pytest.mark.parametrize("top_k_num", [2, 4]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +@pytest.mark.parametrize("ep_rank", [0, 1]) +@pytest.mark.parametrize("apply_a8_quantization", [False]) +def test_all_gather_comm_impl( + num_tokens, + hidden_size, + global_num_experts, + num_local_experts, + top_k_num, + dtype, + ep_rank, + apply_a8_quantization, + mocker, +): + """ + Tests the AllGatherCommImpl against the NativeAllGatherCommImpl. + + This test compares the outputs of the NPU-optimized AllGatherCommImpl + with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure + correctness across various configurations. 
+ """ + if top_k_num > global_num_experts: + pytest.skip("top_k_num cannot be greater than global_num_experts") + if num_local_experts > global_num_experts: + pytest.skip( + "num_local_experts cannot be greater than global_num_experts") + + device = torch.device("npu") + + # mock get_tensor_model_parallel_rank to return ep_rank + mocker.patch( + "vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank", + return_value=ep_rank, + ) + + # make moe config + parallel_config = SimpleNamespace( + enable_expert_parallel=num_local_experts < global_num_experts) + moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make( + tp_size_=max(2, global_num_experts // num_local_experts), + dp_size_=1, + vllm_parallel_config=parallel_config, + ) + + moe_config = FusedMoEConfig( + num_experts=global_num_experts, + experts_per_token=top_k_num, + hidden_dim=hidden_size, + num_local_experts=num_local_experts, + moe_parallel_config=moe_parallel_config, + in_dtype=dtype, + quant_config=None, # No quantization in this test + max_num_tokens=num_tokens, + ) + + # Instantiate implementations + native_impl = NativeAllGatherCommImpl(moe_config) + + all_gather_impl = AllGatherCommImpl(moe_config) + + # --- Input Data --- + hidden_states = torch.randn(num_tokens, + hidden_size, + device=device, + dtype=dtype) + topk_ids = torch.randint(0, + global_num_experts, (num_tokens, top_k_num), + device=device, + dtype=torch.int32) + topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype) + topk_weights = torch.nn.functional.softmax(topk_weights, dim=1) + + num_experts = global_num_experts + + expert_map = None + if num_local_experts < global_num_experts: + # Create a map where some experts are local and some are not + expert_map = torch.full((global_num_experts, ), -1, device=device) + expert_map[ep_rank * num_local_experts:(ep_rank + 1) * + num_local_experts] = torch.arange(num_local_experts, + device=device) + num_experts = num_local_experts + + # --- 
Run Native Implementation (Golden Reference) --- + native_hidden_states_out = hidden_states.clone() + ( + native_permuted_hidden, + native_expert_tokens, + _, + _, + ) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map, + num_experts, apply_a8_quantization) + # Simulate MLP output + native_mlp_output = torch.randn_like(native_permuted_hidden) + native_impl.unpermute(native_mlp_output, native_hidden_states_out) + + # --- Run AllGather Implementation --- + all_gather_hidden_states_out = hidden_states.clone() + ( + all_gather_permuted_hidden, + all_gather_expert_tokens, + _, + _, + ) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights, + expert_map, num_experts, apply_a8_quantization) + + # Use the same simulated MLP output for a fair comparison + all_gather_mlp_output = native_mlp_output.clone() + + all_gather_impl.unpermute(all_gather_mlp_output, + all_gather_hidden_states_out) + + # --- Assertions --- + # Define tolerance based on dtype + atol = 1e-3 if dtype == torch.float16 else 1e-2 + rtol = 1e-3 if dtype == torch.float16 else 1e-2 + + # 1. Compare expert_tokens from pre_process + assert torch.allclose(native_expert_tokens.to( + all_gather_expert_tokens.device), + all_gather_expert_tokens, + atol=atol, + rtol=rtol), "Expert tokens do not match." + + # 2. Compare permuted_hidden_states from pre_process + num_valid_tokens = native_expert_tokens.sum() + assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to( + all_gather_permuted_hidden.device), + all_gather_permuted_hidden[:num_valid_tokens], + atol=atol, + rtol=rtol), "Permuted hidden states do not match." + + # 3. Compare final hidden_states from post_process + assert torch.allclose(native_hidden_states_out.to( + all_gather_hidden_states_out.device), + all_gather_hidden_states_out, + atol=atol, + rtol=rtol), "Final hidden states do not match." 
+ gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/ops/test_rotary_embedding.py b/tests/e2e/singlecard/ops/test_rotary_embedding.py new file mode 100644 index 0000000..6f513b2 --- /dev/null +++ b/tests/e2e/singlecard/ops/test_rotary_embedding.py @@ -0,0 +1,351 @@ +# Copyright 2023 The vLLM team. + +# Copyright (c) Huawei Technologies Co., Ltd. 2024-2025. All rights reserved. +# Adapted from +# https://github.com/vllm-project/vllm/blob/main/vllm/tests/kernels/test_rotary_embedding.py + +import gc +from typing import Optional, Tuple, Union + +import pytest +import torch +import torch.nn as nn + +from vllm_ascend.utils import enable_custom_op + +enable_custom_op() + +# Only Neox style true scenario is supported for now +IS_NEOX_STYLE = [True] +DTYPES = [torch.half] +HEAD_SIZES = [64, 64, 96, 128, 256] +ROTARY_DIMS = [None, 32] # None means rotary dim == head size +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing +SEQ_LENS = [11, 4096] # Arbitrary values for testing +NUM_TOKENS = [10, 21] +SEEDS = [0] +DEVICES = [f"npu:{0}"] +# Set tolerance to 1 for quant ops +DEFAULT_ATOL = 1e-3 +DEFAULT_RTOL = 1e-3 + + +def _apply_rotary_emb( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> torch.Tensor: + """ + Args: + x: [num_tokens, num_heads, head_size] + cos: [num_tokens, head_size // 2] + sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. 
+ """ + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] + o1 = x1 * cos - x2 * sin + o2 = x2 * cos + x1 * sin + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) + + +# adapted from https://github.com/vllm-project/vllm/vllm/model_executor/layers/rotary_embedding.py +class RotaryEmbedding(nn.Module): + """Original rotary positional embedding.""" + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.head_size = head_size + self.rotary_dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.is_neox_style = is_neox_style + self.dtype = dtype + + cache = self._compute_cos_sin_cache() + cache = cache.to(dtype) + self.cos_sin_cache: torch.Tensor + self.register_buffer("cos_sin_cache", cache, persistent=False) + + def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + """Compute the inverse frequency.""" + # NOTE(woosuk): To exactly match the HF implementation, we need to + # use CPU to compute the cache and then move it to GPU. However, we + # create the cache on GPU for faster initialization. This may cause + # a slight numerical difference between the HF implementation and ours. 
+ inv_freq = 1.0 / (base**(torch.arange( + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) + return inv_freq + + def _compute_cos_sin_cache(self) -> torch.Tensor: + """Compute the cos and sin cache.""" + inv_freq = self._compute_inv_freq(self.base) + t = torch.arange(self.max_position_embeddings, dtype=torch.float) + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + return cache + + def forward_native( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """A PyTorch-native implementation of forward().""" + if offsets is not None: + positions = positions + offsets + positions = positions.flatten() + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions) + cos, sin = cos_sin.chunk(2, dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + + +# test with leading dimension and merge seqlen and batch_size as num_tokens +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", 
DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_rotary_embedding_quant_with_leading_dim( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + if rotary_dim is None: + rotary_dim = head_size + + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + rope = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style, dtype) + rope = rope.to(dtype=dtype) + num_tokens = batch_size * seq_len + positions = torch.randint(0, max_position, (batch_size * seq_len, )) + qkv_tensor = torch.randn(num_tokens, + num_heads * head_size * 3, + dtype=dtype) + query, key, _ = qkv_tensor.split( + [num_heads * head_size, num_heads * head_size, num_heads * head_size], + dim=-1, + ) + + ref_query, ref_key = rope.forward_native(positions, query, key) + query, key = torch.ops._C.rotary_embedding( + positions, + query, + key, + rope.head_size, + rope.cos_sin_cache, + rope.is_neox_style, + ) + + # Compare the results. 
+ torch.testing.assert_close(query.view(ref_query.size()), + ref_query, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) + torch.testing.assert_close(key.view(ref_key.size()), + ref_key, + atol=DEFAULT_ATOL, + rtol=DEFAULT_RTOL) + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() + + +class ModelwithRotaryEmbedding(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__() + self.qkv_proj = nn.Linear(hidden_size, num_heads * head_size * 3) + self.rope = RotaryEmbedding( + head_size=head_size, + rotary_dim=rotary_dim, + max_position_embeddings=max_position_embeddings, + base=base, + is_neox_style=is_neox_style, + dtype=dtype, + ) + self.o_proj = nn.Linear(num_heads * head_size, hidden_size) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph + qkv = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(3, dim=-1) + query, key = torch.ops._C.rotary_embedding( + positions, + q, + k, + self.rope.head_size, + self.rope.cos_sin_cache, + self.rope.is_neox_style, + ) + query = query.view(q.shape) + key = key.view(k.shape) + o = self.o_proj(query) + return o + + +# The first graph seems will have some accuracy issue when directly run pytest on the ops folder, +# add a warmup graph replay for workaround +ACL_GRPAH_FIRST_RUN = True + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("num_tokens", BATCH_SIZES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) 
@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_capture_rotary_embedding_in_aclgraph(
    is_neox_style: bool,
    num_tokens: int,
    num_heads: int,
    head_size: int,
    rotary_dim: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
    max_position_embeddings: int = 8192,
    base: int = 10000,
):
    """Test if the rotary embedding can be captured in aclgraph."""
    torch.manual_seed(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
    model = ModelwithRotaryEmbedding(
        hidden_size=num_heads * head_size,
        num_heads=num_heads,
        head_size=head_size,
        rotary_dim=rotary_dim,
        max_position_embeddings=max_position_embeddings,
        base=base,
        is_neox_style=is_neox_style,
        dtype=dtype,
    )

    def custom_op_checking_backend(gm: torch.fx.GraphModule, example_input):
        # String-match on the FX graph to validate that the rotary_embedding
        # custom kernel really is inside the captured graph.
        graph = str(gm.graph)
        assert "_C.rotary_embedding" in graph
        return gm

    static_positions = torch.randint(0, max_position_embeddings,
                                     (num_tokens, ))
    static_hidden_states = torch.randn(num_tokens,
                                       num_heads * head_size,
                                       dtype=dtype,
                                       device="npu")
    compiled_model = torch.compile(model, backend=custom_op_checking_backend)
    stream = torch.npu.Stream()
    stream.wait_stream(torch.npu.current_stream())
    with torch.npu.stream(stream):
        # warmup the fx graph before capture
        for _ in range(3):
            static_output = compiled_model(static_positions,
                                           static_hidden_states,
                                           offsets=None)
    stream.wait_stream(torch.npu.current_stream())

    aclgraph = torch.npu.NPUGraph()

    with torch.npu.graph(aclgraph):
        # Capture the model in aclgraph.
        static_output = compiled_model(static_positions, static_hidden_states)

    # Refill the static input buffers with fresh data, then replay.
    random_filled_positions = torch.randint(0,
                                            max_position_embeddings,
                                            (num_tokens, ),
                                            device="npu")
    random_filled_hidden_states = torch.randn(num_tokens,
                                              num_heads * head_size,
                                              dtype=dtype,
                                              device="npu")
    static_positions.copy_(random_filled_positions)
    static_hidden_states.copy_(random_filled_hidden_states)

    aclgraph.replay()
    global ACL_GRPAH_FIRST_RUN
    if ACL_GRPAH_FIRST_RUN:
        # First replay has a known accuracy issue; skip one comparison.
        ACL_GRPAH_FIRST_RUN = False
        return
    output_reference = model(static_positions, static_hidden_states)
    torch.testing.assert_close(static_output,
                               output_reference,
                               atol=DEFAULT_ATOL,
                               rtol=DEFAULT_RTOL)
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()


# === tests/e2e/singlecard/ops/test_vocabparallelembedding.py ===
import gc
from typing import Tuple

import pytest
import torch
import torch_npu  # noqa: F401

import vllm_ascend.platform  # noqa: F401
from vllm_ascend.utils import enable_custom_op

enable_custom_op()

# Test parameters
DTYPES = [torch.int32]
#SHAPES = [(100,), (5, 20), (3, 4, 5)] # Various tensor shapes
#SHAPES = [(3, 4, 8), (3, 4, 5)] # Various tensor shapes
SHAPES = [(3, 4, 3)]
DEVICES = [f"npu:{0}"]
SEEDS = [0]


def get_masked_input_and_mask_ref(
        input_: torch.Tensor, org_vocab_start_index: int,
        org_vocab_end_index: int, num_org_vocab_padding: int,
        added_vocab_start_index: int,
        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Reference implementation for verification.

    Maps ids inside [org_start, org_end) and [added_start, added_end) onto a
    compact local vocabulary; all other ids become 0 with mask=True.
    """
    in_org_vocab = (input_ >= org_vocab_start_index) & (
        input_ < org_vocab_end_index)
    in_added_vocab = (input_ >= added_vocab_start_index) & (
        input_ < added_vocab_end_index)
    added_offset = added_vocab_start_index - (
        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
    valid_offset = (org_vocab_start_index *
                    in_org_vocab) + (added_offset * in_added_vocab)
    vocab_mask = in_org_vocab | in_added_vocab
    masked_input = vocab_mask * (input_ - valid_offset)
    return masked_input, ~vocab_mask


@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_get_masked_input_and_mask(
    shape: Tuple[int, ...],
    dtype: torch.dtype,
    device: str,
    seed: int,
) -> None:
    # Deterministic input so the custom op and the reference see identical
    # data across runs.
    torch.manual_seed(seed)
    torch.set_default_device(device)

    # Generate random input tensor
    input_tensor = torch.randint(0, 1000, shape, dtype=dtype)

    # Test parameters
    test_case = {
        "org_start": 100,
        "org_end": 200,
        "padding": 0,
        "added_start": 300,
        "added_end": 400,
    }

    # Get reference result
    ref_masked_input, ref_mask = get_masked_input_and_mask_ref(
        input_tensor, test_case["org_start"], test_case["org_end"],
        test_case["padding"], test_case["added_start"], test_case["added_end"])

    # Get custom op result
    print("input_tensor:", input_tensor)
    custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
        input_tensor, test_case["org_start"], test_case["org_end"],
        test_case["padding"], test_case["added_start"], test_case["added_end"])

    ref_masked_input = ref_masked_input.to(dtype)
    print("custom_masked_input:", custom_masked_input)
    print("ref_masked_input:", ref_masked_input)
    print("custom_mask:", custom_mask)
    print("ref_mask:", ref_mask)
    # Compare results
    torch.testing.assert_close(
        custom_masked_input,
        ref_masked_input,
        rtol=1e-5,
        atol=1e-5,
        msg=f"Masked input mismatch for case: {test_case}")
    torch.testing.assert_close(custom_mask,
                               ref_mask,
                               rtol=1e-5,
                               atol=1e-5,
                               msg=f"Mask mismatch for case: {test_case}")
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py new file mode 100644 index 0000000..0c01a07 --- /dev/null +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import os + +import pytest +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +@pytest.fixture +def sampling_config(): + return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False) + + +@pytest.fixture +def model_name(): + return "wemaster/deepseek_mtp_main_random_bf16" + + +def test_mtp_correctness( + sampling_config: SamplingParams, + model_name: str, +): + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using mtp speculative decoding. 
+ ''' + with VllmRunner(model_name, + tensor_parallel_size=1, + gpu_memory_utilization=0.7, + max_model_len=256, + enforce_eager=True) as ref_llm: + ref_outputs = ref_llm.generate(example_prompts, sampling_config) + + with VllmRunner( + model_name, + tensor_parallel_size=1, + max_num_seqs=256, + gpu_memory_utilization=0.7, + distributed_executor_backend="mp", + enable_expert_parallel=True, + speculative_config={ + "method": "deepseek_mtp", + "num_speculative_tokens": 1, + }, + enforce_eager=True, + max_model_len=2000, + additional_config={"ascend_scheduler_config": { + "enabled": False + }}) as spec_llm: + spec_outputs = spec_llm.generate(example_prompts, sampling_config) + + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + ref_token_ids = ref_output[0][0] + spec_token_ids = spec_output[0][0] + if ref_token_ids == spec_token_ids[:len(ref_token_ids)]: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output[1][0]}") + print(f"spec_output: {spec_output[1][0]}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py new file mode 100644 index 0000000..1bf6fea --- /dev/null +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +import os + +import pytest +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +@pytest.fixture +def sampling_config(): + return SamplingParams(temperature=0, max_tokens=256, ignore_eos=False) + + +@pytest.fixture +def model_name(): + return "wemaster/deepseek_mtp_main_random_bf16" + + +def test_mtp_torchair_correctness( + sampling_config: SamplingParams, + model_name: str, +): + example_prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using mtp speculative decoding. 
+ ''' + with VllmRunner(model_name, + tensor_parallel_size=1, + gpu_memory_utilization=0.7, + max_model_len=256, + enforce_eager=False, + additional_config={ + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": False, + "graph_batch_sizes": [1, 2, 4], + }, + }) as ref_llm: + ref_outputs = ref_llm.generate(example_prompts, sampling_config) + with VllmRunner(model_name, + tensor_parallel_size=1, + max_num_seqs=256, + gpu_memory_utilization=0.7, + distributed_executor_backend="mp", + enable_expert_parallel=True, + speculative_config={ + "method": "deepseek_mtp", + "num_speculative_tokens": 1, + }, + enforce_eager=False, + max_model_len=2000, + additional_config={ + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": False, + "graph_batch_sizes": [1, 2, 4], + } + }) as spec_llm: + spec_outputs = spec_llm.generate(example_prompts, sampling_config) + + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + ref_token_ids = ref_output[0][0] + spec_token_ids = spec_output[0][0] + if ref_token_ids == spec_token_ids[:len(ref_token_ids)]: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output[1][0]}") + print(f"spec_output: {spec_output[1][0]}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.66 * len(ref_outputs)) diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py new file mode 100644 index 0000000..9a1bfb8 --- /dev/null +++ b/tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import random +from typing import Any + +import pytest +from vllm import LLM, SamplingParams + +from tests.e2e.conftest import VllmRunner + + +@pytest.fixture +def test_prompts(): + prompt_types = ["repeat", "sentence"] + num_prompts = 10 + prompts = [] + + random.seed(0) + random_prompt_type_choices = random.choices(prompt_types, k=num_prompts) + + # Generate a mixed batch of prompts, some of which can be easily + # predicted by n-gram matching and some which likely cannot. + for kind in random_prompt_type_choices: + word_choices = ["test", "temp", "hello", "where"] + word = random.choice(word_choices) + if kind == "repeat": + prompt = f""" + please repeat the word '{word}' 10 times. + give no other output than the word at least ten times in a row, + in lowercase with spaces between each word and without quotes. + """ + elif kind == "sentence": + prompt = f""" + please give a ten-word sentence that + uses the word {word} at least once. + give no other output than that simple sentence without quotes. 
+ """ + else: + raise ValueError(f"Unknown prompt type: {kind}") + prompts.append([{"role": "user", "content": prompt}]) + + return prompts + + +@pytest.fixture +def sampling_config(): + return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False) + + +@pytest.fixture +def model_name(): + return "LLM-Research/Meta-Llama-3.1-8B-Instruct" + + +def eagle_model_name(): + return "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B" + + +def eagle3_model_name(): + return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B" + + +def test_ngram_correctness( + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, +): + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using ngram speculative decoding. + ''' + pytest.skip("Not current support for the test.") + ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + with VllmRunner(model_name, + speculative_config={ + "method": "ngram", + "prompt_lookup_max": 5, + "prompt_lookup_min": 3, + "num_speculative_tokens": 3, + }, + max_model_len=1024, + enforce_eager=True) as runner: + spec_outputs = runner.model.chat(test_prompts, sampling_config) + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Heuristic: expect at least 70% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. 
+ assert matches > int(0.7 * len(ref_outputs)) + + +@pytest.mark.skipif(True, reason="oom in CI, fix me") +@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"]) +def test_eagle_correctness( + test_prompts: list[list[dict[str, Any]]], + sampling_config: SamplingParams, + model_name: str, + use_eagle3: bool, +): + ''' + Compare the outputs of a original LLM and a speculative LLM + should be the same when using eagle speculative decoding. + ''' + if not use_eagle3: + pytest.skip("Not current support for the test.") + + ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True) + ref_outputs = ref_llm.chat(test_prompts, sampling_config) + del ref_llm + + spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name() + with VllmRunner( + model_name, + trust_remote_code=True, + enable_chunked_prefill=True, + max_num_seqs=1, + max_num_batched_tokens=2048, + gpu_memory_utilization=0.6, + speculative_config={ + "method": "eagle3" if use_eagle3 else "eagle", + "model": spec_model_name, + "num_speculative_tokens": 2, + "max_model_len": 128, + }, + max_model_len=128, + enforce_eager=True, + ) as runner: + spec_outputs = runner.model.chat(test_prompts, sampling_config) + + matches = 0 + misses = 0 + for ref_output, spec_output in zip(ref_outputs, spec_outputs): + if ref_output.outputs[0].text == spec_output.outputs[0].text: + matches += 1 + else: + misses += 1 + print(f"ref_output: {ref_output.outputs[0].text}") + print(f"spec_output: {spec_output.outputs[0].text}") + + # Heuristic: expect at least 66% of the prompts to match exactly + # Upon failure, inspect the outputs to check for inaccuracy. + assert matches > int(0.66 * len(ref_outputs)) diff --git a/tests/e2e/singlecard/test_aclgraph.py b/tests/e2e/singlecard/test_aclgraph.py new file mode 100644 index 0000000..cf14a9e --- /dev/null +++ b/tests/e2e/singlecard/test_aclgraph.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.

Run `pytest tests/compile/test_aclgraph.py`.
"""

import pytest
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

MODELS = [
    "Qwen/Qwen3-0.6B",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_aclgraph(
    model: str,
    max_tokens: int,
) -> None:
    """Greedy outputs must be identical with aclgraph enabled and disabled."""
    prompts = [
        "Hello, my name is", "The president of the United States is",
        "The capital of France is", "The future of AI is"
    ]

    sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
    # enforce_eager=False -> graph (aclgraph) execution path.
    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=False,
    ) as runner:
        vllm_aclgraph_outputs = runner.model.generate(prompts, sampling_params)

    with VllmRunner(
            model,
            max_model_len=1024,
            enforce_eager=True,
    ) as runner:
        vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

    vllm_aclgraph_outputs_list = []
    for output in vllm_aclgraph_outputs:
        vllm_aclgraph_outputs_list.append(
            (output.outputs[0].index, output.outputs[0].text))

    vllm_eager_outputs_list = []
    for output in vllm_eager_outputs:
        vllm_eager_outputs_list.append(
            (output.outputs[0].index, output.outputs[0].text))

    check_outputs_equal(
        outputs_0_lst=vllm_eager_outputs_list,
        outputs_1_lst=vllm_aclgraph_outputs_list,
        name_0="vllm_eager_outputs",
        name_1="vllm_aclgraph_outputs",
    )


# === tests/e2e/singlecard/test_ascend_scheduler.py ===
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal

MODEL = "Qwen/Qwen3-0.6B"


def test_concurrent_partial_prefill():
    """Three identical prompts must each yield exactly one completion when
    prefill capacity is shared (max_num_seqs=3)."""
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=2048,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
                                            3)
        assert len(outputs) == 3
        for output in outputs:
            assert len(output.outputs) == 1


def test_prefix_cache_stats_is_recorded():
    """The second generation of an identical prompt must report the cached
    token count."""
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=2048,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        # FIX(comment): the prompt is 129 tokens, which guarantees the first
        # 128 tokens (8 full 16-token blocks) are cached; the original
        # comment claimed "17 tokens / first 16 tokens".
        input_tokens = {"prompt_token_ids": [101] * 129}
        _ = vllm_model.model.generate([input_tokens])
        outputs = vllm_model.model.generate([input_tokens])
        assert outputs[0].num_cached_tokens == 128


@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        max_tokens: int, chunked_prefill_token_size: int) -> None:
    """Chunked prefill must produce the same greedy output as an
    unchunked run under the ascend scheduler."""
    example_prompts = [
        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs."
    ]
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )
# === tests/e2e/singlecard/test_camem.py ===
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import gc

import torch
from vllm import SamplingParams
from vllm.utils import GiB_bytes

from tests.e2e.conftest import VllmRunner
from tests.e2e.utils import fork_new_process_for_each_test
from vllm_ascend.device_allocator.camem import CaMemAllocator


@fork_new_process_for_each_test
def test_basic_camem():
    """Tensors from the CaMem pool must interoperate with default-pool
    tensors, and sleeping the allocator must release their NPU memory."""
    shape = (1024, 1024)

    # A tensor from the default memory pool.
    x = torch.empty(shape, device='npu:0')
    x.zero_()

    # Tensors allocated inside the custom (sleepable) memory pool.
    allocator = CaMemAllocator.get_instance()
    with allocator.use_memory_pool():
        y = torch.empty(shape, device='npu:0')
        y.zero_()
        y += 1
        z = torch.empty(shape, device='npu:0')
        z.zero_()
        z += 2

    # Default-pool and custom-pool tensors can be mixed in one op.
    output = x + y + z
    assert torch.allclose(output, torch.ones_like(output) * 3)

    # Sleeping the allocator must free NPU memory.
    free_bytes = torch.npu.mem_get_info()[0]
    allocator.sleep()
    free_bytes_after_sleep = torch.npu.mem_get_info()[0]
    assert free_bytes_after_sleep > free_bytes
    allocator.wake_up()

    # After wake-up the custom-pool tensors are usable again.
    output = x + y + z
    assert torch.allclose(output, torch.ones_like(output) * 3)

    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()


@fork_new_process_for_each_test
def test_end_to_end():
    """Sleep level 1 must release (most of) the model weights' NPU memory
    and wake_up must restore identical generation results."""
    free, total = torch.npu.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running

    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)

    with VllmRunner("Qwen/Qwen3-0.6B",
                    enforce_eager=True,
                    enable_sleep_mode=True) as runner:

        output = runner.model.generate(prompt, sampling_params)
        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
        # which is difficult to measure in the test. therefore, we only
        # test sleep level 1 here.
        runner.model.sleep(level=1)

        free_gpu_bytes_after_sleep, total = torch.npu.mem_get_info()
        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
        # now the memory usage should be less than the model weights
        # (0.5B model, 1GiB weights)
        assert used_bytes < 1 * GiB_bytes

        runner.model.wake_up()
        output2 = runner.model.generate(prompt, sampling_params)

        # Generation after wake-up must match the pre-sleep generation.
        assert output[0].outputs[0].text == output2[0].outputs[0].text


# === tests/e2e/singlecard/test_chunked.py ===
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
+""" +import gc + +import pytest +import torch +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + +MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [1]) +def test_models( + model: str, + max_tokens: int, +) -> None: + prompts = ["The president of the United States is"] + + sampling_params = SamplingParams( + max_tokens=max_tokens, + temperature=0.0, + ) + + with VllmRunner(model, long_prefill_token_threshold=20, + enforce_eager=True) as vllm_model: + output1 = vllm_model.generate(prompts, sampling_params) + + with VllmRunner(model, + enforce_eager=True, + additional_config={ + 'ascend_scheduler_config': { + 'enabled': True + }, + }) as vllm_model: + output2 = vllm_model.generate(prompts, sampling_params) + + # Extract the generated token IDs for comparison + token_ids1 = output1[0][0][0] + token_ids2 = output2[0][0][0] + + print(f"Token IDs 1: {token_ids1}") + print(f"Token IDs 2: {token_ids2}") + + # Convert token IDs to tensors and calculate cosine similarity + # Take the length of a shorter sequence to ensure consistent dimensions + min_len = min(len(token_ids1), len(token_ids2)) + + tensor1 = torch.tensor(token_ids1[:min_len], dtype=torch.float32) + tensor2 = torch.tensor(token_ids2[:min_len], dtype=torch.float32) + + # Calculate similarity using torch.cosine_similarity + similarity = torch.cosine_similarity(tensor1, tensor2, dim=0) + print(f"Token IDs cosine similarity: {similarity.item()}") + + assert similarity > 0.95 + + gc.collect() + torch.npu.empty_cache() + torch.npu.reset_peak_memory_stats() diff --git a/tests/e2e/singlecard/test_embedding.py b/tests/e2e/singlecard/test_embedding.py new file mode 100644 index 0000000..4f85dd7 --- /dev/null +++ b/tests/e2e/singlecard/test_embedding.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +from modelscope import snapshot_download # type: ignore[import-untyped] + +from tests.e2e.conftest import HfRunner, VllmRunner +from tests.e2e.utils import check_embeddings_close + + +def test_embed_models_correctness(): + queries = ['What is the capital of China?', 'Explain gravity'] + + model_name = snapshot_download("Qwen/Qwen3-Embedding-0.6B") + with VllmRunner( + model_name, + task="embed", + enforce_eager=True, + ) as vllm_runner: + vllm_outputs = vllm_runner.encode(queries) + + with HfRunner( + model_name, + dtype="float32", + is_sentence_transformer=True, + ) as hf_runner: + hf_outputs = hf_runner.encode(queries) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) diff --git a/tests/e2e/singlecard/test_guided_decoding.py b/tests/e2e/singlecard/test_guided_decoding.py new file mode 100644 index 0000000..6cb1c7b --- /dev/null +++ b/tests/e2e/singlecard/test_guided_decoding.py @@ -0,0 +1,150 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import os + +import jsonschema +import pytest +import regex as re +from vllm.outputs import RequestOutput +from vllm.sampling_params import GuidedDecodingParams, SamplingParams + +from tests.e2e.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" +MODEL_NAME = "Qwen/Qwen3-0.6B" + +GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"] + + +@pytest.fixture(scope="module") +def sample_regex(): + return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + + +@pytest.fixture(scope="module") +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "number" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work_history"] + } + + +@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend) +def test_guided_json_completion(guided_decoding_backend: str, + sample_json_schema): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=500, + 
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
+
+    with VllmRunner(
+            MODEL_NAME,
+            seed=0,
+            guided_decoding_backend=guided_decoding_backend,
+    ) as vllm_model:
+        prompts = [
+            f"Give an example JSON for an employee profile "
+            f"that fits this schema: {sample_json_schema}"
+        ] * 2
+        inputs = vllm_model.get_inputs(prompts)
+        outputs = vllm_model.model.generate(inputs,
+                                            sampling_params=sampling_params)
+
+        assert outputs is not None
+
+        for output in outputs:
+            assert output is not None
+            assert isinstance(output, RequestOutput)
+            prompt = output.prompt
+
+            generated_text = output.outputs[0].text
+            assert generated_text is not None
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+            output_json = json.loads(generated_text)
+            jsonschema.validate(instance=output_json,
+                                schema=sample_json_schema)
+
+
+@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
+def test_guided_regex(guided_decoding_backend: str, sample_regex):
+    if guided_decoding_backend == "outlines":
+        pytest.skip("Outlines doesn't support regex-based guided decoding.")
+
+    sampling_params = SamplingParams(
+        temperature=0.8,
+        top_p=0.95,
+        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+
+    with VllmRunner(
+            MODEL_NAME,
+            seed=0,
+            guided_decoding_backend=guided_decoding_backend,
+    ) as vllm_model:
+        prompts = [
+            f"Give an example IPv4 address with this regex: {sample_regex}"
+        ] * 2
+        inputs = vllm_model.get_inputs(prompts)
+        outputs = vllm_model.model.generate(inputs,
+                                            sampling_params=sampling_params)
+        assert outputs is not None
+        for output in outputs:
+            assert output is not None
+            assert isinstance(output, RequestOutput)
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(generated_text)
+            assert generated_text is not None
+            assert re.fullmatch(sample_regex, generated_text) is not None
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/e2e/singlecard/test_ilama_lora.py
b/tests/e2e/singlecard/test_ilama_lora.py new file mode 100644 index 0000000..499e46f --- /dev/null +++ b/tests/e2e/singlecard/test_ilama_lora.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +import vllm +from modelscope import snapshot_download # type: ignore +from vllm.lora.request import LoRARequest + +from tests.e2e.conftest import VllmRunner + +MODEL_PATH = "vllm-ascend/ilama-3.2-1B" + +PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 + +EXPECTED_LORA_OUTPUT = [ + "SELECT count(*) FROM singer", + "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 + "SELECT DISTINCT Country FROM singer WHERE Age > 20", +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: + prompts = [ + PROMPT_TEMPLATE.format(query="How many singers do we have?"), + PROMPT_TEMPLATE.format( + query= + "What is the average, minimum, and maximum age of all singers from France?" 
# noqa: E501 + ), + PROMPT_TEMPLATE.format( + query= + "What are all distinct countries where singers above age 20 are from?" # noqa: E501 + ), + ] + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: list[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def test_ilama_lora(ilama_lora_files): + with VllmRunner(snapshot_download(MODEL_PATH), + enable_lora=True, + dtype="half", + max_loras=4, + max_model_len=1024, + max_num_seqs=16, + enforce_eager=True) as vllm_model: + + output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] + + output2 = do_sample(vllm_model.model, ilama_lora_files, lora_id=2) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/e2e/singlecard/test_profile_execute_duration.py b/tests/e2e/singlecard/test_profile_execute_duration.py new file mode 100644 index 0000000..465db7d --- /dev/null +++ b/tests/e2e/singlecard/test_profile_execute_duration.py @@ -0,0 +1,71 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import gc
+import os
+import time
+from unittest.mock import patch
+
+import torch
+import vllm  # noqa: F401
+
+from vllm_ascend.utils import ProfileExecuteDuration
+
+
+@patch.dict(os.environ, {"VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": "1"})
+def test_execute_duration_enabled_discrepancy():
+    a = torch.randn(10000, 10000).npu()
+    b = torch.randn(10000, 10000).npu()
+
+    # warmup
+    torch.matmul(a, b)
+    torch.npu.synchronize()
+
+    cpu_start = time.perf_counter()
+    with ProfileExecuteDuration().capture_async("forward"):
+        torch.matmul(a, b)
+        torch.npu.synchronize()
+    cpu_duration = (time.perf_counter() - cpu_start) * 1000
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert npu_durations and 'forward' in npu_durations
+    assert not ProfileExecuteDuration._observations
+
+    # Assert discrepancy between CPU and NPU duration is within 50% roughly
+    diff = abs(cpu_duration - npu_durations['forward']) / max(
+        cpu_duration, npu_durations['forward'])
+    assert diff <= 0.5, (
+        f"CPU={cpu_duration:.2f}ms, NPU={npu_durations['forward']:.2f}ms")
+
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
+
+
+def test_execute_duration_disabled():
+    a = torch.randn(100, 100).npu()
+    b = torch.randn(100, 100).npu()
+
+    with ProfileExecuteDuration().capture_async("forward"):
+        torch.matmul(a, b)
+        torch.npu.synchronize()
+    npu_durations = ProfileExecuteDuration().pop_captured_sync()
+    assert not npu_durations
+
+    gc.collect()
+    torch.npu.empty_cache()
+    torch.npu.reset_peak_memory_stats()
diff --git
a/tests/e2e/singlecard/test_quantization.py b/tests/e2e/singlecard/test_quantization.py new file mode 100644 index 0000000..4ec3198 --- /dev/null +++ b/tests/e2e/singlecard/test_quantization.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +from modelscope import snapshot_download # type: ignore[import-untyped] + +from tests.e2e.conftest import VllmRunner + + +def test_quant_W8A8(): + max_tokens = 5 + example_prompts = [ + "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs." + ] + with VllmRunner( + snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"), + max_model_len=8192, + enforce_eager=True, + gpu_memory_utilization=0.7, + quantization="ascend", + ) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/e2e/singlecard/test_sampler.py b/tests/e2e/singlecard/test_sampler.py new file mode 100644 index 0000000..424343b --- /dev/null +++ b/tests/e2e/singlecard/test_sampler.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from vllm import SamplingParams + +from tests.e2e.conftest import VllmRunner + + +def test_models_topk() -> None: + example_prompts = [ + "Hello, my name is", + ] + sampling_params = SamplingParams(max_tokens=5, + temperature=0.0, + top_k=50, + top_p=0.9) + + with VllmRunner("Qwen/Qwen3-0.6B", + max_model_len=8192, + gpu_memory_utilization=0.7) as runner: + runner.generate(example_prompts, sampling_params) + + +def test_models_prompt_logprobs() -> None: + example_prompts = [ + "Hello, my name is", + ] + + with VllmRunner("Qwen/Qwen3-0.6B", + max_model_len=8192, + gpu_memory_utilization=0.7) as runner: + runner.generate_greedy_logprobs(example_prompts, + max_tokens=5, + num_logprobs=1) diff --git a/tests/e2e/singlecard/test_vlm.py b/tests/e2e/singlecard/test_vlm.py new file mode 100644 index 0000000..5fe27f6 --- /dev/null +++ b/tests/e2e/singlecard/test_vlm.py @@ -0,0 +1,89 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# +"""Compare the short outputs of HF and vLLM when using greedy sampling. + +Run `pytest tests/test_offline_inference.py`. +""" +import os + +import pytest +from vllm import SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset + +from tests.e2e.conftest import VllmRunner + +os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256" + + +@pytest.mark.skip(reason="fix me") +def test_multimodal_vl(prompt_template): + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] + images = [image] * len(img_questions) + prompts = prompt_template(img_questions) + with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct", + max_model_len=4096, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + enforce_eager=True) as vllm_model: + vllm_model.generate_greedy(prompts=prompts, + images=images, + max_tokens=64) + + +def test_multimodal_audio(): + audio_prompt = "".join([ + f"Audio {idx+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n" + for idx in range(2) + ]) + question = "What sport and what nursery rhyme are referenced?" 
+ prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + mm_data = { + "audio": [ + asset.audio_and_sample_rate for asset in + [AudioAsset("mary_had_lamb"), + AudioAsset("winning_call")] + ] + } + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + + sampling_params = SamplingParams(temperature=0.2, + max_tokens=10, + stop_token_ids=None) + + with VllmRunner("Qwen/Qwen2-Audio-7B-Instruct", + max_model_len=4096, + max_num_seqs=5, + dtype="bfloat16", + limit_mm_per_prompt={"audio": 2}, + gpu_memory_utilization=0.9) as runner: + runner.generate(inputs, sampling_params=sampling_params) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py new file mode 100644 index 0000000..279b767 --- /dev/null +++ b/tests/e2e/utils.py @@ -0,0 +1,106 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/tests/utils.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import functools +import os +import signal +from collections.abc import Sequence +from typing import Callable + +import torch +import torch.nn.functional as F +from typing_extensions import ParamSpec + +_P = ParamSpec("_P") + + +def fork_new_process_for_each_test( + f: Callable[_P, None]) -> Callable[_P, None]: + """Decorator to fork a new process for each test function. + See https://github.com/vllm-project/vllm/issues/7053 for more details. + """ + + @functools.wraps(f) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + # Make the process the leader of its own process group + # to avoid sending SIGTERM to the parent process + os.setpgrp() + from _pytest.outcomes import Skipped + pid = os.fork() + print(f"Fork a new process to run a test {pid}") + if pid == 0: + try: + f(*args, **kwargs) + except Skipped as e: + # convert Skipped to exit code 0 + print(str(e)) + os._exit(0) + except Exception: + import traceback + traceback.print_exc() + os._exit(1) + else: + os._exit(0) + else: + pgid = os.getpgid(pid) + _pid, _exitcode = os.waitpid(pid, 0) + # ignore SIGTERM signal itself + old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) + # kill all child processes + os.killpg(pgid, signal.SIGTERM) + # restore the signal handler + signal.signal(signal.SIGTERM, old_signal_handler) + assert _exitcode == 0, (f"function {f} failed when called with" + f" args {args} and kwargs {kwargs}") + + return wrapper + + +def matryoshka_fy(tensor: torch.Tensor, dimensions: int): + tensor = torch.tensor(tensor) + tensor = tensor[..., :dimensions] + tensor = F.normalize(tensor, p=2, dim=1) + return tensor + + +def check_embeddings_close( + *, + embeddings_0_lst: Sequence[list[float]], + embeddings_1_lst: Sequence[list[float]], + name_0: str, + name_1: str, + tol: float = 1e-3, +) -> None: + assert len(embeddings_0_lst) == len(embeddings_1_lst) + + for prompt_idx, (embeddings_0, embeddings_1) in enumerate( + zip(embeddings_0_lst, embeddings_1_lst)): + assert 
len(embeddings_0) == len(embeddings_1), ( + f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}") + + sim = F.cosine_similarity(torch.tensor(embeddings_0), + torch.tensor(embeddings_1), + dim=0) + + fail_msg = (f"Test{prompt_idx}:" + f"\nCosine similarity: \t{sim:.4f}" + f"\n{name_0}:\t{embeddings_0[:16]!r}" + f"\n{name_1}:\t{embeddings_1[:16]!r}") + + assert sim >= 1 - tol, fail_msg diff --git a/tests/ut/__init__.py b/tests/ut/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/ut/attention/test_attention_mask.py b/tests/ut/attention/test_attention_mask.py new file mode 100644 index 0000000..a87d21b --- /dev/null +++ b/tests/ut/attention/test_attention_mask.py @@ -0,0 +1,133 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_mask import AttentionMaskBuilder + + +class TestAttentionMaskBuilder(TestBase): + + def test_init_attention_mask_builder(self): + # generate attention_mask_builder with float16 + attention_mask_builder = AttentionMaskBuilder(max_seq_len=1024, + dtype=torch.float16) + self.assertEqual(attention_mask_builder._seq_len_cached, 1024) + self.assertEqual(attention_mask_builder.attn_mask_cache.dtype, + torch.float16) + self.assertEqual(attention_mask_builder.attn_mask_cache.shape, + (1024, 1024)) + self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1], + torch.tensor(float("-inf"), dtype=torch.float16)) + + # generate attention_mask_builder with bfloat16 + attention_mask_builder = AttentionMaskBuilder(max_seq_len=2048, + dtype=torch.bfloat16) + self.assertEqual(attention_mask_builder._seq_len_cached, 2048) + self.assertEqual(attention_mask_builder.attn_mask_cache.dtype, + torch.bfloat16) + self.assertEqual(attention_mask_builder.attn_mask_cache.shape, + (2048, 2048)) + self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1], + torch.tensor(1, dtype=torch.bfloat16)) + + def test_get_mask_scale_factor(self): + # supported data types + self.assertEqual( + AttentionMaskBuilder.get_mask_scale_factor(torch.float16), 1) + self.assertEqual( + AttentionMaskBuilder.get_mask_scale_factor(torch.bfloat16), -10000) + # mask_scale_factor now only supports data types: torch.float16 and torch.bfloat16 + # Otherwise raise ValueError + with self.assertRaises(ValueError): + AttentionMaskBuilder.get_mask_scale_factor(torch.int8) + + def test_get_attn_mask(self): + # if the len is less than max_seq_len, the attn_mask_cache will not be updated + attention_mask_builder = AttentionMaskBuilder(max_seq_len=1024, + dtype=torch.float16) + attn_mask = attention_mask_builder.get_attn_mask( + max_seq_len=512, dtype=torch.float16, device=torch.device("cpu")) + self.assertEqual(attn_mask.shape, (512, 
512)) + self.assertEqual(attn_mask[0][-1], + torch.tensor(float("-inf"), dtype=torch.float16)) + self.assertEqual(attention_mask_builder._seq_len_cached, 1024) + self.assertEqual(attention_mask_builder.attn_mask_cache.shape, + (1024, 1024)) + self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1], + torch.tensor(float("-inf"), dtype=torch.float16)) + + # if the len is greater than max_seq_len, the attn_mask_cache will be updated + attn_mask = attention_mask_builder.get_attn_mask( + max_seq_len=2048, dtype=torch.float16, device=torch.device("cpu")) + self.assertEqual(attn_mask.shape, (2048, 2048)) + self.assertEqual(attn_mask[0][-1], + torch.tensor(float("-inf"), dtype=torch.float16)) + self.assertEqual(attention_mask_builder._seq_len_cached, 2048) + self.assertEqual(attention_mask_builder.attn_mask_cache.shape, + (2048, 2048)) + self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1], + torch.tensor(float("-inf"), dtype=torch.float16)) + + def test_get_splitfuse_attn_mask(self): + attention_mask_builder = AttentionMaskBuilder(max_seq_len=1024, + dtype=torch.float16) + attn_mask = attention_mask_builder.get_splitfuse_attn_mask( + seq_lens=torch.tensor([10, 20, 100]), + position=torch.tensor([7, 8, 9, 18, 19, 99]), + dtype=torch.float16, + device=torch.device("cpu"), + ) + self.assertEqual(attn_mask.shape, (6, 100)) + self.assertEqual(attention_mask_builder._seq_len_cached, 1024) + + attn_mask = attention_mask_builder.get_splitfuse_attn_mask( + seq_lens=torch.tensor([10, 3000, 2000]), + position=torch.tensor([7, 8, 9, 2999, 1999]), + dtype=torch.float16, + device=torch.device("cpu"), + ) + self.assertEqual(attn_mask.shape, (5, 3000)) + self.assertEqual(attention_mask_builder._seq_len_cached, 3000) + + # splitfuse_attn_mask now only supports data types: torch.float16 and torch.bfloat16 + # otherwise raise ValueError + with self.assertRaises(ValueError): + attn_mask = attention_mask_builder.get_splitfuse_attn_mask( + seq_lens=torch.tensor([10, 20, 100]), 
+ position=torch.tensor([7, 8, 9, 18, 19, 99]), + dtype=torch.int8, + device=torch.device("cpu"), + ) + + def test_mask_value_cleanliness(self): + attention_mask_builder = AttentionMaskBuilder(max_seq_len=6, + dtype=torch.bfloat16) + self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1], + torch.tensor(1, dtype=torch.bfloat16)) + + attn_mask = attention_mask_builder.get_splitfuse_attn_mask( + seq_lens=torch.tensor([6]), + position=torch.tensor([3, 4, 5]), + dtype=torch.bfloat16, + device=torch.device("cpu"), + ) + self.assertEqual( + attn_mask[-2][-1], + torch.tensor(-10000, dtype=torch.bfloat16, + device=attn_mask.device)) + self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1], + torch.tensor(1, dtype=torch.bfloat16)) diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py new file mode 100644 index 0000000..556c8d7 --- /dev/null +++ b/tests/ut/attention/test_attention_v1.py @@ -0,0 +1,578 @@ +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, + AscendAttentionBackendImpl, + AscendAttentionMetadataBuilder, + AscendAttentionState, + AscendMetadata, + CommonAttentionState) +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata + + +class TestAscendAttentionBackend(TestBase): + + def test_get_name(self): + self.assertEqual(AscendAttentionBackend.get_name(), "ASCEND") + + def test_get_impl_cls(self): + self.assertEqual(AscendAttentionBackend.get_impl_cls(), + AscendAttentionBackendImpl) + + def test_get_metadata_cls(self): + self.assertEqual(AscendAttentionBackend.get_metadata_cls(), + AscendMetadata) + + def test_get_state_cls(self): + self.assertEqual(AscendAttentionBackend.get_state_cls(), + CommonAttentionState) + + def test_get_builder_cls(self): + self.assertEqual(AscendAttentionBackend.get_builder_cls(), + AscendAttentionMetadataBuilder) + + 
@patch('vllm_ascend.attention.attention_v1.is_310p') + def test_get_kv_cache_shape_310p(self, mock_is_310p): + mock_is_310p.return_value = True + result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40) + self.assertEqual(result, (2, 10, 30 * 40 // 16, 20, 16)) + + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) + def test_get_kv_cache_shape_not_310p(self, mock_is_310p): + result = AscendAttentionBackend.get_kv_cache_shape(10, 20, 30, 40) + self.assertEqual(result, (2, 10, 20, 30, 40)) + + def test_get_bsh_kv_cache_shape(self): + result = AscendAttentionBackend.get_bsh_kv_cache_shape(10, 20, 30, 40) + self.assertEqual(result, (2, 10, 20, 30 * 40)) + + def test_swap_blocks(self): + src_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))] + dst_kv_cache = [torch.zeros((10, 20)), torch.zeros((10, 20))] + src_to_dst = torch.tensor([[0, 1], [2, 3]]) + AscendAttentionBackend.swap_blocks(src_kv_cache, dst_kv_cache, + src_to_dst) + self.assertTrue(torch.all(dst_kv_cache[0][1] == src_kv_cache[0][0])) + self.assertTrue(torch.all(dst_kv_cache[1][3] == src_kv_cache[1][2])) + + def test_copy_blocks(self): + kv_caches = [torch.zeros((10, 20)), torch.zeros((10, 20))] + src_to_dists = torch.tensor([[0, 1], [2, 3]]) + AscendAttentionBackend.copy_blocks(kv_caches, src_to_dists) + self.assertTrue(torch.all(kv_caches[0][1] == kv_caches[0][0])) + self.assertTrue(torch.all(kv_caches[1][3] == kv_caches[1][2])) + + +class TestAscendAttentionMetadataBuilder(TestBase): + + def setUp(self): + self.mock_vllm_config = MagicMock() + self.mock_vllm_config.model_config.max_model_len = 640 + self.mock_vllm_config.cache_config.block_size = 64 + self.mock_device = 'cpu:0' + self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config, + self.mock_device) + + def test_reorder_batch(self): + mock_input_batch = MagicMock() + mock_scheduler_output = MagicMock() + + result = self.builder.reorder_batch(mock_input_batch, + mock_scheduler_output) + + 
self.assertFalse(result) + + @patch('vllm_ascend.attention.attention_v1.AscendMetadata') + @patch('torch_npu.npu_format_cast') + @patch('vllm_ascend.utils.nd_to_nz_2d') + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) + def test_build_prefill_no_cache(self, mock_is_310p, mock_nd_to_nz_2d, + mock_npu_format_cast, + mock_ascend_metadata): + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=torch.tensor([0, 3, 7]), + query_start_loc_cpu=torch.tensor([0, 3, 7]), + seq_lens_cpu=torch.tensor([5, 6]), + num_reqs=2, + num_actual_tokens=10, + max_query_len=5, + decode_token_per_req=torch.tensor([1, 1]), + block_table_tensor=torch.zeros((10, 10)), + slot_mapping_cpu=torch.tensor(range(20)), + actual_seq_lengths_q=torch.tensor([0, 1]), + positions=torch.tensor([10, 10]), + attn_mask=torch.ones((10, 10)), + spec_attn_mask=None, + attn_state=AscendAttentionState.PrefillNoCache) + + mock_nz_tensor = MagicMock() + mock_model = MagicMock() + mock_nd_to_nz_2d.return_value = mock_nz_tensor + mock_npu_format_cast.return_value = mock_nz_tensor + + self.builder.build(common_attn_metadata, mock_model) + + @patch('vllm_ascend.attention.attention_v1.AscendMetadata') + @patch('torch_npu.npu_format_cast') + @patch('vllm_ascend.utils.nd_to_nz_spec') + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) + @patch('vllm_ascend.attention.attention_v1.AscendAttentionState') + def test_build_chunked_prefill(self, mock_ascend_attention_state, + mock_is_310p, mock_nd_to_nz_spec, + mock_npu_format_cast, mock_ascend_metadata): + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=torch.tensor([0, 2, 5, 9]), + query_start_loc_cpu=torch.tensor([0, 2, 5, 9]), + seq_lens_cpu=torch.tensor([4, 5, 6]), + num_reqs=3, + num_actual_tokens=15, + max_query_len=6, + decode_token_per_req=torch.tensor([1, 1, 1]), + block_table_tensor=torch.zeros((10, 10)), + slot_mapping_cpu=torch.tensor(range(20)), + 
actual_seq_lengths_q=torch.tensor([0, 1, 2]), + positions=torch.tensor([10, 10]), + attn_mask=torch.ones((15, 15)), + spec_attn_mask=None, + attn_state=AscendAttentionState.ChunkedPrefill) + + mock_ascend_attention_state = MagicMock() + mock_ascend_attention_state.PrefillNoCache = 0 + + mock_nz_tensor = MagicMock() + mock_model = MagicMock() + mock_nd_to_nz_spec.return_value = mock_nz_tensor + mock_npu_format_cast.return_value = mock_nz_tensor + + self.builder.build(common_attn_metadata, mock_model) + + @patch('vllm_ascend.attention.attention_v1.AscendMetadata') + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) + def test_build_non_310p(self, mock_is_310p, mock_ascend_metadata): + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=torch.tensor([0, 2, 5, 9]), + query_start_loc_cpu=torch.tensor([0, 2, 5, 9]), + seq_lens_cpu=torch.tensor([4, 5, 6]), + num_reqs=3, + num_actual_tokens=15, + max_query_len=6, + decode_token_per_req=torch.tensor([1, 1, 1]), + block_table_tensor=torch.zeros((10, 10)), + slot_mapping_cpu=torch.tensor(range(20)), + actual_seq_lengths_q=torch.tensor([0, 1, 2]), + positions=torch.tensor([10, 10]), + attn_mask=torch.ones((15, 15)), + spec_attn_mask=None, + attn_state=AscendAttentionState.ChunkedPrefill) + mock_model = MagicMock() + + self.builder.build(common_attn_metadata, mock_model) + + +class TestAscendAttentionBackendImpl(TestBase): + + def setUp(self): + self.layer = MagicMock() + self.layer.layer_name = "test_layer" + self.layer._k_scale_float = 1.0 + self.layer._v_scale_float = 1.0 + + self.attention_type = MagicMock() + self.attention_type.DECODER = "decoder" + self.attention_type.ENCODER = "encoder" + + self.attn_metadata = MagicMock() + self.attn_metadata.return_value = "1" + + self.layer_no_quant = MagicMock( + spec=['layer_name', '_k_scale_float', '_v_scale_float']) + self.layer_no_quant.layer_name = "test_layer" + self.layer_no_quant._k_scale_float = 1.0 + 
self.layer_no_quant._v_scale_float = 1.0 + + self.impl = AscendAttentionBackendImpl( + num_heads=8, + head_size=64, + scale=1.0, + num_kv_heads=8, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="float16", + logits_soft_cap=None, + attn_type=self.attention_type.DECODER, + kv_sharing_target_layer_name=None) + + self.impl_192 = AscendAttentionBackendImpl( + num_heads=8, + head_size=192, + scale=1.0, + num_kv_heads=8, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="float16", + logits_soft_cap=None, + attn_type=self.attention_type.DECODER, + kv_sharing_target_layer_name=None) + + self.impl_error = AscendAttentionBackendImpl( + num_heads=8, + head_size=192, + scale=1.0, + num_kv_heads=8, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype="float16", + logits_soft_cap=None, + attn_type=None, + kv_sharing_target_layer_name=None) + + self.impl_swa = AscendAttentionBackendImpl( + num_heads=8, + head_size=64, + scale=1.0, + num_kv_heads=8, + alibi_slopes=None, + sliding_window=1024, + kv_cache_dtype="float16", + logits_soft_cap=None, + attn_type=self.attention_type.DECODER, + kv_sharing_target_layer_name=None) + + @patch('torch.ops.vllm.unified_ascend_attention_with_output') + def test_forward_trace_flag_true(self, mock_unified_attention): + """Test forward pass when trace_flag is True""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 0, 0, 8, 64) + metadata = self.attn_metadata + layer = self.layer + + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=True) + + mock_unified_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_paged_attention_splitfuse') + def test_forward_with_quant_method(self, mock_paged_attention): + """Test forward pass when layer has quant_method""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + 
k_cache = torch.ones(1, 10, 8, 64, dtype=torch.int8) + v_cache = torch.ones(1, 10, 8, 64, dtype=torch.int8) + kv_cache = [k_cache, v_cache] + ret_value = torch.ones(1, 1, 10, 8, 64, dtype=torch.int8) + + metadata = MagicMock() + metadata.num_actual_tokens = torch.randn(10, 8 * 64) + metadata.block_tables = torch.randn(10, 8 * 64) + metadata.seq_lens = torch.randn(10, 8 * 64) + metadata.attn_mask = torch.randn(10, 8 * 64) + metadata.query_lens = torch.randn(10, 8 * 64) + layer = self.layer + layer.quant_method = MagicMock() + layer.quant_method.apply.return_value = ret_value + + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + layer.quant_method.apply.assert_called_once() + assert output.shape == (10, 8 * 64) + + def test_forward_no_attn_metadata(self): + """Test forward pass when attn_metadata is None""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 0, 0, 8, 64) + layer = self.layer_no_quant + + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + None, + trace_flag=False) + + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_flash_attention') + def test_forward_prefill_no_cache(self, mock_flash_attention, + mock_reshape_cache): + """Test forward pass in PrefillNoCache state""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.PrefillNoCache + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.seq_lens = torch.tensor([10]) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + # layer.quant_method.apply.return_value = metadata + print(self.layer_no_quant._v_scale_float) + output = self.impl.forward(layer, + 
query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_reshape_cache.assert_called_once() + mock_flash_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_flash_attention') + def test_forward_prefill_no_cache_swa(self, mock_flash_attention, + mock_reshape_cache): + """Test forward pass in PrefillNoCache state""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.PrefillNoCache + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.seq_lens = torch.tensor([10]) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + # layer.quant_method.apply.return_value = metadata + print(self.layer_no_quant._v_scale_float) + output = self.impl_swa.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_reshape_cache.assert_called_once() + mock_flash_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_flash_attention_qlens') + def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens, + mock_npu_reshape_and_cache): + """Test forward pass in PrefillCacheHit state""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.PrefillCacheHit + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.query_lens = torch.tensor([10]) + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = 
self.layer_no_quant + + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_flash_attention_qlens.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_paged_attention') + def test_forward_decode_only(self, mock_paged_attention, + mock_npu_reshape_and_cache): + """Test forward pass in DecodeOnly state""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.DecodeOnly + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_paged_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu.npu_fused_infer_attention_score') + def test_forward_decode_only_swa(self, mock_fused_infer_attention_score, + mock_npu_reshape_and_cache): + """Test forward pass in DecodeOnly state""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_state = AscendAttentionState.DecodeOnly + metadata.seq_lens = torch.tensor([10] * 10) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 100 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + mock_fused_infer_attention_score.return_value = (torch.ones(10, 8, + 64), 1) + output = self.impl_swa.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + 
print(output.shape) + mock_fused_infer_attention_score.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) + @patch('torch_npu._npu_reshape_and_cache') + @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill') + def test_forward_head_size_192(self, mock_vanilla_prefill, + mock_npu_reshape_and_cache, mock_is_310p): + """Test forward pass when head_size is 192""" + + self.impl.head_size = 192 + query = torch.randn(10, 8 * 192) + key = torch.randn(10, 8 * 192) + value = torch.randn(10, 8 * 192) + kv_cache = torch.empty(2, 5, 128, 8, 192) + metadata = self.attn_metadata + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.query_lens = torch.tensor([10]) + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + mock_vanilla_prefill.return_value = MagicMock() + + output = self.impl_192.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_vanilla_prefill.assert_called_once() + assert output.shape == (10, 8 * 192) + + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_paged_attention_splitfuse') + def test_forward_normal_v1_situation(self, mock_paged_attention, + mock_npu_reshape_and_cache): + """Test forward pass in normal V1 situation""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.query_lens = torch.tensor([10]) + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + + output = 
self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_paged_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu.npu_format_cast') + @patch('torch_npu._npu_reshape_and_cache') + @patch('torch_npu._npu_paged_attention_splitfuse') + @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) + def test_forward_310p_device(self, mock_is_310p, mock_paged_attention, + mock_npu_reshape_and_cache, + mock_npu_format_cast): + """Test forward pass on 310P device""" + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.query_lens = torch.tensor([10]) + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + + mock_npu_format_cast.return_value = metadata.attn_mask + output = self.impl.forward(layer, + query, + key, + value, + kv_cache, + metadata, + trace_flag=False) + + mock_paged_attention.assert_called_once() + assert output.shape == (10, 8 * 64) + + @patch('torch_npu._npu_reshape_and_cache') + def test_forward_raise_error(self, mock_paged_attention): + query = torch.randn(10, 8 * 64) + key = torch.randn(10, 8 * 64) + value = torch.randn(10, 8 * 64) + kv_cache = torch.empty(2, 5, 128, 8, 64) + metadata = self.attn_metadata + metadata.attn_mask = torch.randn(1, 1, 10, 10) + metadata.query_lens = torch.tensor([10]) + metadata.seq_lens = torch.tensor([10]) + metadata.block_tables = torch.zeros(1, 5, dtype=torch.long) + metadata.num_actual_tokens = 10 + metadata.slot_mapping = torch.zeros(10, dtype=torch.long) + layer = self.layer_no_quant + + with self.assertRaises(NotImplementedError): + self.impl_error.forward(layer, + query, + key, 
+ value, + kv_cache, + metadata, + trace_flag=False) diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py new file mode 100644 index 0000000..6360504 --- /dev/null +++ b/tests/ut/attention/test_mla_v1.py @@ -0,0 +1,631 @@ +from unittest.mock import MagicMock, patch + +import torch +from vllm.distributed.parallel_state import GroupCoordinator +from vllm.model_executor.layers.linear import LinearBase + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.mla_v1 import (AscendMLABackend, + AscendMLADecodeMetadata, + AscendMLAImpl, AscendMLAMetadata, + AscendMLAMetadataBuilder, + AscendMLAPrefillMetadata) + + +class TestAscendMLABackend(TestBase): + + def test_get_name(self): + self.assertEqual(AscendMLABackend.get_name(), "ASCEND_MLA") + + def test_get_metadata_cls(self): + self.assertEqual(AscendMLABackend.get_metadata_cls(), + AscendMLAMetadata) + + def test_get_builder_cls(self): + self.assertEqual(AscendMLABackend.get_builder_cls(), + AscendMLAMetadataBuilder) + + def test_get_kv_cache_shape(self): + result = AscendMLABackend.get_kv_cache_shape(2, 4, 8, 128) + self.assertEqual(result, (2, 4, 8, 128)) + + def test_get_impl_cls(self): + result = AscendMLABackend.get_impl_cls() + self.assertEqual(result, AscendMLAImpl) + + +class TestAscendMLAPrefillMetadata(TestBase): + + def test_ascend_mla_prefill_metadata_default(self): + attn_mask = torch.tensor([[1, 0], [1, 1]], dtype=torch.bool) + query_lens = [1, 2] + seq_lens = [2, 2] + context_lens = torch.tensor([1, 2]) + input_positions = torch.tensor([0, 1, 0, 1]) + query_start_loc = torch.tensor([0, 1, 3]) + block_table = torch.tensor([[0, 1], [2, 3]]) + max_query_len = 2 + max_seq_lens = 2 + + metadata = AscendMLAPrefillMetadata(attn_mask=attn_mask, + query_lens=query_lens, + seq_lens=seq_lens, + context_lens=context_lens, + input_positions=input_positions, + query_start_loc=query_start_loc, + 
block_table=block_table, + max_query_len=max_query_len, + max_seq_lens=max_seq_lens) + self.assertIs(metadata.attn_mask, attn_mask) + self.assertEqual(metadata.query_lens, query_lens) + self.assertEqual(metadata.seq_lens, seq_lens) + self.assertIs(metadata.context_lens, context_lens) + self.assertIs(metadata.input_positions, input_positions) + self.assertIs(metadata.query_start_loc, query_start_loc) + self.assertIs(metadata.block_table, block_table) + self.assertEqual(metadata.max_query_len, max_query_len) + self.assertEqual(metadata.max_seq_lens, max_seq_lens) + self.assertIsNone(metadata.chunked_context) + + def test_ascend_mla_prefill_metadata_with_chunked_context(self): + cu_seq_lens = torch.tensor([0, 2, 4]) + starts = torch.tensor([0, 2]) + seq_tot = [2, 2] + max_seq_lens = [2, 2] + workspace = torch.randn(2, 4) + chunk_seq_lens = torch.tensor([2, 2]) + + chunked_context = AscendMLAPrefillMetadata.ChunkedContextMetadata( + cu_seq_lens=cu_seq_lens, + starts=starts, + seq_tot=seq_tot, + max_seq_lens=max_seq_lens, + workspace=workspace, + chunk_seq_lens=chunk_seq_lens) + + metadata = AscendMLAPrefillMetadata( + attn_mask=torch.tensor([[1, 0], [1, 1]], dtype=torch.bool), + query_lens=[1, 2], + seq_lens=[2, 2], + context_lens=torch.tensor([1, 2]), + input_positions=torch.tensor([0, 1, 0, 1]), + query_start_loc=torch.tensor([0, 1, 3]), + block_table=torch.tensor([[0, 1], [2, 3]]), + max_query_len=2, + max_seq_lens=2, + chunked_context=chunked_context) + + self.assertIsNotNone(metadata.chunked_context) + self.assertIs(metadata.chunked_context.cu_seq_lens, cu_seq_lens) + self.assertIs(metadata.chunked_context.starts, starts) + self.assertEqual(metadata.chunked_context.seq_tot, seq_tot) + self.assertEqual(metadata.chunked_context.max_seq_lens, max_seq_lens) + self.assertIs(metadata.chunked_context.workspace, workspace) + self.assertIs(metadata.chunked_context.chunk_seq_lens, chunk_seq_lens) + + +class TestAscendMLADecodeMetadata(TestBase): + + def 
test_ascend_mla_decode_metadata_default(self): + input_positions = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 4]]) + block_table = torch.tensor([[0, 3, 2, 1], [0, 2, 1, 3]]) + seq_lens = torch.tensor([[2], [3]]) + max_seq_lens = 4 + seq_lens_list = [2, 3] + attn_mask = None + + metadata = AscendMLADecodeMetadata(input_positions, block_table, + seq_lens, max_seq_lens, + seq_lens_list, attn_mask) + + self.assertIs(metadata.input_positions, input_positions) + self.assertIs(metadata.block_table, block_table) + self.assertIs(metadata.seq_lens, seq_lens) + self.assertEqual(metadata.max_seq_lens, max_seq_lens) + self.assertEqual(metadata.seq_lens_list, seq_lens_list) + self.assertIsNone(attn_mask) + + +class TestAscendMLAMetadata(TestBase): + + def test_ascend_mla_metadata_default(self): + num_actual_tokens = 100 + slot_mapping = torch.randn(100, 4, 1024) + query_start_loc = torch.tensor([1, 2, 3, 4]) + seq_lens = [30, 50] + block_tables = torch.randint(0, 100, (100, 4)) + + num_decodes = 4 + num_decode_tokens = 8 + num_prefills = 8 + + num_input_tokens = 2 + + query_lens = None + head_dim = None + attn_mask = None + attn_state = AscendAttentionState.ChunkedPrefill + + decode = None + prefill = None + + metadata = AscendMLAMetadata(num_actual_tokens, slot_mapping, + query_start_loc, seq_lens, block_tables, + num_decodes, num_decode_tokens, + num_prefills, num_input_tokens, + query_lens, head_dim, attn_mask, + attn_state, decode, prefill) + + self.assertEqual(metadata.num_actual_tokens, num_actual_tokens) + self.assertIs(metadata.slot_mapping, slot_mapping) + self.assertIs(metadata.query_start_loc, query_start_loc) + self.assertEqual(metadata.seq_lens, seq_lens) + self.assertIs(metadata.block_tables, block_tables) + self.assertEqual(metadata.num_decodes, num_decodes) + self.assertEqual(metadata.num_decode_tokens, num_decode_tokens) + self.assertEqual(metadata.num_prefills, num_prefills) + self.assertEqual(metadata.num_input_tokens, num_input_tokens) + 
self.assertEqual(metadata.query_lens, query_lens) + self.assertEqual(metadata.head_dim, head_dim) + self.assertEqual(metadata.attn_mask, attn_mask) + self.assertEqual(metadata.attn_state, attn_state) + self.assertEqual(metadata.decode, decode) + self.assertEqual(metadata.prefill, prefill) + + +class TestAscendMLAMetadataBuilder(TestBase): + + def test_ascend_mla_metadata_builder_default(self): + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.model_config.get_head_size.return_value = 64 + mock_vllm_config.model_config.dtype = torch.float16 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.max_num_seqs = 4 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + ascend_config = MagicMock() + with patch("vllm_ascend.attention.mla_v1.get_ascend_config", + return_value=ascend_config): + builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device) + + self.assertEqual(builder.block_size, + mock_vllm_config.cache_config.block_size) + self.assertEqual( + builder.chunked_prefill_enabled, + mock_vllm_config.scheduler_config.chunked_prefill_enabled) + + def test_reorder_batch(self): + ascend_config = MagicMock() + + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.max_num_seqs = 4 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + with patch("vllm_ascend.attention.mla_v1.get_ascend_config", + return_value=ascend_config): + builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device) + builder.decode_threshold = 1 + + input_batch = MagicMock() + input_batch.req_ids = [0, 1, 2, 3] + + scheduler_output = MagicMock() + scheduler_output.num_scheduled_tokens = {0: 1, 1: 3, 2: 1, 3: 2} + scheduler_output.scheduled_spec_decode_tokens = { + 0: [], + 1: [1], + 2: [], + 3: [] + } + + 
input_batch.swap_states = MagicMock() + + modified = builder.reorder_batch(input_batch, scheduler_output) + + self.assertTrue(modified) + input_batch.swap_states.assert_called_once_with(1, 2) + + +class TestAscendMLAImpl(TestBase): + + @patch('vllm.distributed.parallel_state._TP', + new_callable=lambda: MagicMock(spec=GroupCoordinator)) + @patch("vllm.distributed.get_tensor_model_parallel_world_size", + return_value=2) + @patch("vllm_ascend.attention.mla_v1.get_current_vllm_config") + @patch("vllm_ascend.attention.mla_v1.get_ascend_config") + def setUp(self, ascend_config, get_current_vllm_config, mock_get_tp_size, + mock_tp): + mock_tp.world_size = 2 + vllm_config = MagicMock() + speculative_config = MagicMock() + model_config = MagicMock() + speculative_config.num_speculative_tokens = 4 + vllm_config.speculative_config = speculative_config + model_config.dtype = torch.float16 + vllm_config.model_config = model_config + get_current_vllm_config.return_value = vllm_config + + num_heads = 256 + head_size = 1024 + scale = 0.1 + num_kv_heads = 8 + kv_cache_dtype = "auto" + + kv_a_layernorm = MagicMock() + kv_a_layernorm.weight = torch.randn(96) + kv_a_layernorm.variance_epsilon = 1e-6 + kwargs = { + "q_lora_rank": 64, + "kv_lora_rank": 32, + "qk_nope_head_dim": 64, + "qk_rope_head_dim": 32, + "qk_head_dim": 96, + "v_head_dim": 128, + "rotary_emb": MagicMock(), + "q_proj": MagicMock(), + "kv_b_proj": MagicMock(), + "o_proj": MagicMock(), + "kv_a_proj_with_mqa": MagicMock(), + "kv_a_layernorm": kv_a_layernorm, + } + + self.impl = AscendMLAImpl(num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=kv_cache_dtype, + blocksparse_params=None, + logits_soft_cap=None, + attn_type=None, + kv_sharing_target_layer_name=None, + **kwargs) + + def test_init(self): + self.assertEqual(self.impl.num_heads, 256) + self.assertEqual(self.impl.head_size, 1024) + self.assertEqual(self.impl.scale, 
0.1) + self.assertEqual(self.impl.num_kv_heads, 8) + self.assertEqual(self.impl.kv_cache_dtype, "auto") + self.assertEqual(self.impl.q_lora_rank, 64) + self.assertEqual(self.impl.kv_lora_rank, 32) + self.assertEqual(self.impl.qk_nope_head_dim, 64) + self.assertEqual(self.impl.qk_rope_head_dim, 32) + self.assertEqual(self.impl.qk_head_dim, 96) + self.assertEqual(self.impl.v_head_dim, 128) + self.assertIsNotNone(self.impl.rotary_emb) + self.assertIsNotNone(self.impl.q_proj) + self.assertIsNotNone(self.impl.kv_b_proj) + self.assertIsNotNone(self.impl.o_proj) + self.assertIsNotNone(self.impl.kv_a_proj_with_mqa) + self.assertIsNotNone(self.impl.kv_a_layernorm) + self.assertEqual(self.impl.num_queries_per_kv, 32) + self.assertEqual(self.impl.tp_size, 2) + + def test_v_up_proj(self): + batch_size = 4 + x = torch.randn(batch_size, self.impl.num_heads, + self.impl.kv_lora_rank) + + if not hasattr(self.impl, 'W_UV') or self.impl.W_UV is None: + self.impl.W_UV = torch.randn(self.impl.num_heads, + self.impl.kv_lora_rank, + self.impl.v_head_dim) + result = self.impl._v_up_proj(x) + + self.assertEqual(result.shape[0], batch_size) + self.assertEqual(result.shape[1], + self.impl.num_heads * self.impl.v_head_dim) + + def test_q_proj_and_k_up_proj(self): + batch_size = 4 + x = torch.randn(batch_size, self.impl.num_heads, self.impl.qk_head_dim) + q_proj_output = torch.randn(batch_size, self.impl.num_heads, + self.impl.qk_head_dim) + self.impl.q_proj.return_value = (q_proj_output, ) + if not hasattr(self.impl, 'W_UK_T') or self.impl.W_UK_T is None: + self.impl.W_UK_T = torch.randn(self.impl.num_heads, + self.impl.qk_nope_head_dim, + self.impl.kv_lora_rank) + result = self.impl._q_proj_and_k_up_proj(x) + ql_nope, q_pe = result + self.assertEqual(ql_nope.shape[0], batch_size) + self.assertEqual(ql_nope.shape[1], self.impl.num_heads) + self.assertEqual(ql_nope.shape[2], self.impl.kv_lora_rank) + self.assertEqual(q_pe.shape[0], batch_size) + self.assertEqual(q_pe.shape[1], 
self.impl.num_heads) + self.assertEqual(q_pe.shape[2], self.impl.qk_rope_head_dim) + + def test_process_weights_after_loading(self): + layer = MagicMock(spec=LinearBase) + layer.input_size_per_partition = 10 + quant_method = MagicMock() + apply = MagicMock() + quant_method.apply = apply + layer.quant_method = quant_method + shape_0 = self.impl.num_heads * (self.impl.qk_nope_head_dim + + self.impl.v_head_dim) + shape_1 = self.impl.kv_lora_rank + layer.weight = torch.randn(shape_0, shape_1) + self.impl.kv_b_proj = layer + apply.return_value = layer.weight.T + self.impl.process_weights_after_loading(torch.bfloat16) + + self.assertEqual(self.impl.W_UK_T.shape[0], self.impl.num_heads) + self.assertEqual(self.impl.W_UK_T.shape[1], self.impl.qk_nope_head_dim) + self.assertEqual(self.impl.W_UK_T.shape[2], self.impl.kv_lora_rank) + + self.assertEqual(self.impl.W_UV.shape[0], self.impl.num_heads) + self.assertEqual(self.impl.W_UV.shape[1], self.impl.kv_lora_rank) + self.assertEqual(self.impl.W_UV.shape[2], self.impl.v_head_dim) + + def test_compute_prefill_context_none(self): + batch_size = 4 + kv_cache = torch.randn(10, 1, 1, 192) + query = torch.randn(batch_size, self.impl.num_heads, + self.impl.qk_head_dim) + metadata = MagicMock() + metadata.prefill = None + prefix_out = torch.randn(2, 16, 128) + prefix_lse = torch.randn(2, 16, 8) + q_pe = query[..., self.impl.qk_nope_head_dim:] + q_nope = query[..., :self.impl.qk_nope_head_dim] + + out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache, + 32, metadata, prefix_out, + prefix_lse) + + self.assertTrue(torch.equal(prefix_out, out)) + self.assertTrue(torch.equal(prefix_lse, lse)) + + @patch("torch_npu.atb.npu_paged_cache_load") + @patch("torch_npu.atb.npu_ring_mla") + def test_compute_prefill_context(self, mock_ring, mock_load): + S, N, D, VD = 2, self.impl.num_heads, self.impl.qk_head_dim, self.impl.v_head_dim + _, AND = self.impl.qk_rope_head_dim, self.impl.qk_nope_head_dim + latent_kv_dim = 
self.impl.kv_lora_rank + num_blocks, block_size = 100, 20 + query = torch.randn(S, N, D) + q_nope = query[..., :self.impl.qk_nope_head_dim] + q_pe = query[..., self.impl.qk_nope_head_dim:] + kv_cache_0 = torch.randn(num_blocks, block_size, N, latent_kv_dim) + kv_cache_1 = torch.randn(num_blocks, block_size, N, D) + kv_cache = [kv_cache_0, kv_cache_1] + prefix_out = torch.randn(S, N, 128) + prefix_lse = torch.randn(S, N) + + self.impl.kv_b_proj.return_value = (torch.randn(8, N, VD + AND), ) + + chunk_ctx = MagicMock() + chunk_ctx.seq_tot = [8] + chunk_ctx.chunk_seq_lens = [torch.tensor([8])] + chunk_ctx.starts = [torch.tensor([0])] + + prefill_meta = MagicMock() + prefill_meta.chunked_context = chunk_ctx + prefill_meta.query_lens = [8] + prefill_meta.block_table = torch.randint(0, 100, (S, 4)) + + meta = MagicMock() + meta.prefill = prefill_meta + self.impl.prefill_mask = torch.triu( + torch.ones(512, 512, device=q_nope.device, dtype=q_nope.dtype), 1) + + out, lse = self.impl._compute_prefill_context(q_nope, q_pe, kv_cache, + 32, meta, prefix_out, + prefix_lse) + + mock_load.assert_called_once() + mock_ring.assert_called_once() + + self.assertEqual(out.shape, prefix_out.shape) + self.assertEqual(lse.shape, prefix_lse.shape) + + @patch("vllm_ascend.attention.mla_v1.AscendMLAImpl._v_up_proj") + @patch("torch_npu.npu_fused_infer_attention_score") + def test_forward_decode_without_graph(self, + mock_npu_fused_infer_attention_score, + mock_up_proj): + num_tokens = 100 + block_size = 4 + q_nope = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_nope_head_dim) + q_pe = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_rope_head_dim) + k_nope = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_nope_head_dim) + k_pe = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_rope_head_dim) + metadata = MagicMock() + metadata.decode = MagicMock() + metadata.decode.block_table = MagicMock() + metadata.decode.seq_lens = 10 + 
mock_npu_fused_infer_attention_score.return_value = [ + torch.randn(num_tokens, self.impl.num_heads, + self.impl.kv_lora_rank), None + ] + mock_up_proj.return_value = torch.randn(num_tokens, + self.impl.num_heads, + self.impl.v_head_dim) + result = self.impl._forward_decode(q_nope, q_pe, k_nope, k_pe, + block_size, metadata) + self.assertEqual(result.shape[0], num_tokens) + self.assertEqual(result.shape[1], self.impl.num_heads) + self.assertEqual(result.shape[2], self.impl.v_head_dim) + mock_up_proj.assert_called_once() + mock_npu_fused_infer_attention_score.assert_called_once() + + @patch("vllm_ascend.attention.mla_v1.npu_prefetch") + def test_mla_preprocess(self, magic_npu_fetch): + magic_npu_fetch.return_value = MagicMock() + batch_size = 4 + seq_len = 8 + hidden_size = 1024 + hidden_states = torch.randn(batch_size * seq_len, hidden_size) + + kv_cache = MagicMock() + + attn_metadata = MagicMock() + attn_metadata.num_decodes = 2 + attn_metadata.num_prefills = 2 + attn_metadata.num_decode_tokens = 2 + attn_metadata.num_actual_tokens = 4 + num_prefill_tokens = 2 + attn_metadata.slot_mapping = torch.arange(4) + attn_metadata.decode.cos = torch.randn(2, 64) + attn_metadata.decode.sin = torch.randn(2, 64) + attn_metadata.prefill.cos = torch.randn(2, 64) + attn_metadata.prefill.sin = torch.randn(2, 64) + + self.impl.q_a_proj = MagicMock() + self.impl.q_a_layernorm = MagicMock() + self.impl.q_a_layernorm.return_value = torch.randn( + attn_metadata.num_actual_tokens, self.impl.num_heads, + self.impl.qk_rope_head_dim) + self.impl.kv_a_proj_with_mqa = MagicMock() + self.impl.kv_a_proj_with_mqa.return_value = [ + torch.randn(num_prefill_tokens, self.impl.num_heads, + self.impl.qk_nope_head_dim + self.impl.kv_lora_rank) + ] + self.impl.q_proj = MagicMock() + self.impl.q_proj.return_value = [ + torch.randn(num_prefill_tokens, self.impl.num_heads, + self.impl.qk_head_dim) + ] + self.impl.kv_b_proj = MagicMock() + self.impl.kv_b_proj.return_value = [ + 
torch.randn(num_prefill_tokens, self.impl.num_heads, + self.impl.v_head_dim + self.impl.qk_nope_head_dim) + ] + self.impl.rope_single = MagicMock(side_effect=lambda x, cos, sin: x) + self.impl.exec_kv_decode = MagicMock() + self.impl.exec_kv_decode.return_value = [MagicMock(), MagicMock()] + self.impl.exec_kv_prefill = MagicMock() + self.impl.exec_kv_prefill.return_value = [ + torch.randn(num_prefill_tokens, self.impl.num_heads, + self.impl.qk_rope_head_dim), + torch.randn(num_prefill_tokens, self.impl.num_heads, + self.impl.kv_lora_rank) + ] + self.impl._q_proj_and_k_up_proj = MagicMock() + self.impl._q_proj_and_k_up_proj.return_value = [ + MagicMock(), MagicMock() + ] + self.impl.num_kv_heads = self.impl.num_heads + + decode_res, prefill_res = self.impl._mla_preprocess( + hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False) + + self.assertIsNotNone(decode_res) + self.assertIsNotNone(prefill_res) + + @patch("torch_npu.npu_kv_rmsnorm_rope_cache") + def test_exec_kv_prefill(self, mock_kv_rmsnorm_rope_cache): + B = 2 + N = self.impl.num_kv_heads + D = self.impl.kv_lora_rank + self.impl.qk_rope_head_dim + kv_no_split = torch.randn(B, N, D) + self.impl.enable_kv_nz = None + self.impl.kv_a_layernorm.weight = MagicMock() + self.impl.kv_a_layernorm.variance_epsilon = MagicMock() + cos = MagicMock() + sin = MagicMock() + slots = MagicMock() + kv_cache = [MagicMock(), MagicMock()] + + mock_kv_rmsnorm_rope_cache.return_value = [ + None, None, + torch.randn(B, N, 1, self.impl.qk_rope_head_dim), + torch.randn(B, N, 1, self.impl.kv_lora_rank) + ] + + k_pe, k_nope = self.impl.exec_kv_prefill(kv_no_split, cos, sin, + kv_cache, slots) + + self.assertEqual(k_pe.shape[-1], self.impl.qk_rope_head_dim) + self.assertEqual(k_nope.shape[-1], self.impl.kv_lora_rank) + + @patch("torch_npu.npu_kv_rmsnorm_rope_cache") + def test_exec_kv_decode(self, mock_kv_rmsnorm_rope_cache): + B = 2 + N = self.impl.num_kv_heads + D = self.impl.kv_lora_rank + self.impl.qk_rope_head_dim + 
kv_no_split = torch.randn(B, N, D) + self.impl.enable_kv_nz = None + self.impl.kv_a_layernorm.weight = MagicMock() + self.impl.kv_a_layernorm.variance_epsilon = MagicMock() + cos = MagicMock() + sin = MagicMock() + slots = MagicMock() + kv_cache = [MagicMock(), MagicMock()] + + mock_kv_rmsnorm_rope_cache.return_value = [ + torch.randn(B, N, 1, self.impl.qk_rope_head_dim), + torch.randn(B, N, 1, self.impl.kv_lora_rank), None, None + ] + + k_pe, k_nope = self.impl.exec_kv_decode(kv_no_split, cos, sin, + kv_cache, slots) + + self.assertEqual(k_pe.shape[-1], self.impl.qk_rope_head_dim) + self.assertEqual(k_nope.shape[-1], self.impl.kv_lora_rank) + + @patch("torch.npu.stream") + @patch("vllm_ascend.attention.mla_v1.get_multistream_comm_context") + @patch("torch_npu.npu_fused_infer_attention_score") + def test_forward_decode(self, mock_npu_fused_infer_attention_score, + mock_get_multistream_comm_context, + mock_npu_stream): + B = 2 + N = self.impl.num_kv_heads + BS = 100 + HD = self.impl.v_head_dim + self.impl.kv_lora_rank = 256 + self.impl.spec_token_num = 1 + self.impl._v_up_proj = MagicMock() + self.impl._v_up_proj.return_value = torch.randn(B, N, HD) + q_nope = torch.randn(B, N, self.impl.qk_nope_head_dim) + q_pe = torch.randn(B, N, self.impl.qk_rope_head_dim) + k_nope = torch.randn(BS, N, self.impl.kv_lora_rank) + k_pe = torch.randn(BS, N, self.impl.qk_rope_head_dim) + attn_metadata = MagicMock() + attn_metadata.attn_state = AscendAttentionState.SpecDecoding + attn_metadata.decode = MagicMock() + attn_metadata.decode.actual_seq_lengths_q = MagicMock() + attn_metadata.decode.seq_lens_list = MagicMock() + self.impl.enable_kv_nz = True + + mock_npu_fused_infer_attention_score.return_value = [ + torch.randn(B, N, self.impl.kv_lora_rank), None + ] + mock_get_multistream_comm_context.return_value = None + + result = self.impl._forward_decode(q_nope, q_pe, k_nope, k_pe, BS, + attn_metadata) + + self.assertEqual(result.shape[0], B) + self.assertEqual(result.shape[1], N) + 
+        self.assertEqual(result.shape[2], HD)
+
+        self.impl.enable_kv_nz = False
+        attn_metadata.attn_state = None
+        mock_return_value = MagicMock()
+        mock_get_multistream_comm_context.return_value = mock_return_value
+        mock_return_value.before_comm_event = MagicMock()
+        mock_return_value.comm_stream = MagicMock()
+        mock_npu_stream.return_value = MagicMock()
+
+        result = self.impl._forward_decode(q_nope, q_pe, k_nope, k_pe, BS,
+                                           attn_metadata)
+
+        self.assertEqual(result.shape[0], B)
+        self.assertEqual(result.shape[1], N)
+        self.assertEqual(result.shape[2], HD)
diff --git a/tests/ut/base.py b/tests/ut/base.py
new file mode 100644
index 0000000..36583a5
--- /dev/null
+++ b/tests/ut/base.py
@@ -0,0 +1,44 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+import unittest
+
+import pytest
+
+from vllm_ascend.utils import adapt_patch, register_ascend_customop
+
+
+class TestBase(unittest.TestCase):
+
+    def setUp(self):
+        # Apply the vllm-ascend patches and register the Ascend custom ops
+        # before each test runs; subclasses overriding setUp() call super().
+        adapt_patch(True)
+        adapt_patch()
+        register_ascend_customop()
+        super().setUp()
+
+
+class PytestBase:
+    """Base class for pytest-based tests.
+    pytest's mocker and parametrize fixtures are not compatible with
+    unittest.TestCase, so pytest-style tests derive from this class instead.
+ """ + + @pytest.fixture(autouse=True) + def setup(self): + adapt_patch(True) + adapt_patch() + register_ascend_customop() diff --git a/tests/ut/conftest.py b/tests/ut/conftest.py new file mode 100644 index 0000000..799edc6 --- /dev/null +++ b/tests/ut/conftest.py @@ -0,0 +1,26 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from vllm_ascend.utils import adapt_patch # noqa E402 +from vllm_ascend.utils import register_ascend_customop + +adapt_patch() +adapt_patch(True) + +# register Ascend CustomOp here because uts will use this +register_ascend_customop() diff --git a/tests/ut/core/test_schedule_config.py b/tests/ut/core/test_schedule_config.py new file mode 100644 index 0000000..df36b52 --- /dev/null +++ b/tests/ut/core/test_schedule_config.py @@ -0,0 +1,167 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from vllm.config import SchedulerConfig + +from tests.ut.base import TestBase +from vllm_ascend.core.schedule_config import AscendSchedulerConfig + + +class TestAscendSchedulerConfig(TestBase): + + def setUp(self): + self.basic_scheduler_config = SchedulerConfig( + max_num_batched_tokens=8192, + max_model_len=8192, + is_multimodal_model=False, + send_delta_data=False, + scheduler_delay_factor=0, + ) + + def test_initialize_from_config_with_default(self): + # No additional config given, check the default value here. + ascend_config = AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, {}) + self.assertEqual(ascend_config.enable_chunked_prefill, False) + self.assertEqual(ascend_config.policy, "fcfs") + self.assertEqual(ascend_config.num_scheduler_steps, 1) + self.assertEqual(ascend_config.scheduler_cls, + "vllm_ascend.core.scheduler.AscendScheduler") + self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192) + self.assertEqual(ascend_config.encoder_cache_size, 8192) + + def test_initialize_from_config_with_override(self): + # test override + ascend_config = AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + enable_chunked_prefill=False, + policy="fcfs", + num_scheduler_steps=1, + scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler", + max_num_batched_tokens=2048, + max_model_len=2048, + ), + ) + self.assertEqual(ascend_config.enable_chunked_prefill, False) + self.assertEqual(ascend_config.policy, "fcfs") + self.assertEqual(ascend_config.num_scheduler_steps, 1) + self.assertEqual(ascend_config.scheduler_cls, + "vllm_ascend.core.scheduler.AscendScheduler") + self.assertEqual(ascend_config.max_num_batched_tokens, 2048) + self.assertEqual(ascend_config.encoder_cache_size, 2048) + + def test_not_implemented_policy(self): + with self.assertRaises(NotImplementedError) as 
context: + AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + policy="custom_policy", + max_num_batched_tokens=2048, + max_model_len=2048, + ), + ) + self.assertIn( + "currently AscendScheduler only supports fcfs policy", + str(context.exception), + ) + + def test_not_implemented_multimodal(self): + with self.assertRaises(NotImplementedError) as context: + AscendSchedulerConfig.initialize_from_config( + SchedulerConfig(is_multimodal_model=True), {}) + self.assertIn("currently AscendScheduler only supports LLM models", + str(context.exception)) + + def test_not_implemented_multi_step(self): + with self.assertRaises(NotImplementedError) as context: + AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + num_scheduler_steps=2, + max_num_batched_tokens=2048, + max_model_len=2048, + ), + ) + self.assertIn( + "currently AscendScheduler doesn't support multi-step", + str(context.exception), + ) + + def test_not_implemented_send_delta_data(self): + with self.assertRaises(NotImplementedError) as context: + AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + send_delta_data=True, + max_num_batched_tokens=2048, + max_model_len=2048, + ), + ) + self.assertIn( + "currently AscendScheduler doesn't support send_delta_data", + str(context.exception), + ) + + def test_not_implemented_delay_factor(self): + with self.assertRaises(NotImplementedError) as context: + AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + delay_factor=1, + max_num_batched_tokens=2048, + max_model_len=2048, + ), + ) + self.assertIn( + "currently AscendScheduler doesn't support scheduler_delay_factor", + str(context.exception), + ) + + def test_no_override(self): + ascend_config = AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, {}) + 
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192) + self.assertEqual(ascend_config.encoder_cache_size, 8192) + + def test_valid_config_with_chunked_prefill(self): + ascend_config = AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + enable_chunked_prefill=True, + max_num_batched_tokens=2048, + max_model_len=4096, + ), + ) + self.assertEqual(ascend_config.max_num_batched_tokens, 2048) + self.assertEqual(ascend_config.max_model_len, 4096) + self.assertTrue(ascend_config.enable_chunked_prefill) + + def test_invalid_config_without_chunked_prefill(self): + with self.assertRaises(ValueError) as context: + AscendSchedulerConfig.initialize_from_config( + self.basic_scheduler_config, + AscendSchedulerConfig( + enable_chunked_prefill=False, + max_num_batched_tokens=2048, + max_model_len=4096, + ), + ) + self.assertIn( + "Ascend scheduler is enabled without chunked prefill feature", + str(context.exception), + ) + self.assertIn("max_num_batched_tokens (2048)", str(context.exception)) + self.assertIn("max_model_len (4096)", str(context.exception)) diff --git a/tests/ut/core/test_scheduler.py b/tests/ut/core/test_scheduler.py new file mode 100644 index 0000000..1855c80 --- /dev/null +++ b/tests/ut/core/test_scheduler.py @@ -0,0 +1,898 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Dict, List, Optional, Tuple +from unittest.mock import MagicMock, patch + +import torch +from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig, + SchedulerConfig, SpeculativeConfig, VllmConfig) +from vllm.multimodal.inputs import PlaceholderRange +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from 
vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from tests.ut.base import TestBase +from vllm_ascend.core.scheduler import AscendScheduler +from vllm_ascend.utils import vllm_version_is + +if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + +EOS_TOKEN_ID = 50256 +MODEL = "Qwen3-0.6B" +ENABLE_PREFIX_CACHING = None +PROMPT_LOGPROBS = None +ENABLE_CHUNKED_PREFILL = False +MAX_NUM_BATCHED_TOKENS = 10000 +LONG_PREFILL_TOKEN_THRESHOLD = 0 +NUM_SPECULATIVE_TOKENS = None +MAX_NUM_SEQS = 16 + + +def create_requests( + num_requests: int, + num_tokens: int = 10, + mm_positions: Optional[list[PlaceholderRange]] = None, + max_tokens: int = 16, + stop_token_ids: Optional[list[int]] = None, + block_size: int = 3, + hash_fn=hash, +): + init_none_hash(hash_fn) + prompt_logprobs = PROMPT_LOGPROBS + sampling_params = SamplingParams(ignore_eos=False, + max_tokens=max_tokens, + stop_token_ids=stop_token_ids, + prompt_logprobs=prompt_logprobs) + requests = [] + for i in range(num_requests): + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + request = Request(request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + multi_modal_kwargs=None, + multi_modal_placeholders=None, + multi_modal_hashes=None, + eos_token_id=EOS_TOKEN_ID, + pooling_params=None, + block_hasher=get_request_block_hasher( + block_size, hash_fn)) + else: + request = Request(request_id=f"{i}", + prompt_token_ids=[i] * num_tokens, + sampling_params=sampling_params, + eos_token_id=EOS_TOKEN_ID, + pooling_params=None, + block_hasher=get_request_block_hasher( + block_size, hash_fn)) + requests.append(request) + return requests + + +def make_output(scheduler): + req_ids = [req.request_id for req in scheduler.running] + req_id_to_index = { + req.request_id: i + for i, req in 
enumerate(scheduler.running) + } + sampled_token_ids = [[1000]] * len(scheduler.running) + logprobs = None + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + else: + modelrunner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=logprobs, + prompt_logprobs_dict={}, + pooler_output=[], + ) + return modelrunner_output + + +class TestAscendScheduler(TestBase): + + @patch("vllm.config.ModelConfig.__post_init__", MagicMock()) + @patch("vllm.config.VllmConfig.__post_init__", MagicMock()) + @patch('vllm.v1.core.sched.scheduler.compute_encoder_budget') + def create_scheduler(self, mock_compute_encoder_budget): + mock_compute_encoder_budget.return_value = [10, 20] + use_kv_connector = False + block_size = 16 + + scheduler_config = SchedulerConfig( + max_num_seqs=16, + max_model_len=MAX_NUM_BATCHED_TOKENS, + long_prefill_token_threshold=LONG_PREFILL_TOKEN_THRESHOLD, + disable_chunked_mm_input=False, + enable_chunked_prefill=ENABLE_CHUNKED_PREFILL, + max_num_batched_tokens=MAX_NUM_BATCHED_TOKENS, + ) + + scheduler_config.max_num_encoder_input_tokens = 10000 + scheduler_config.encoder_cache_size = 10000 + scheduler_config.chunked_prefill_enabled = False + + model_config = ModelConfig( + model=MODEL, + task="auto", + tokenizer=MODEL, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="float16", + seed=42, + max_model_len=MAX_NUM_BATCHED_TOKENS, + ) + model_config.pooler_config = MagicMock() + model_config.multimodal_config = MagicMock() + model_config.hf_config = MagicMock() + model_config.hf_config.is_encoder_decoder = False + # Cache config, optionally force APC + kwargs_cache: Dict[str, + Any] = ({} if ENABLE_PREFIX_CACHING is None else { + 
'enable_prefix_caching': + ENABLE_PREFIX_CACHING + }) + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + **kwargs_cache, + ) + + kv_transfer_config = KVTransferConfig( + kv_connector="SharedStorageConnector", + kv_role="kv_both", + kv_connector_extra_config={"shared_storage_path": "local_storage"}, + ) if use_kv_connector else None + + speculative_config: Optional[SpeculativeConfig] = None + if NUM_SPECULATIVE_TOKENS is not None: + speculative_config = SpeculativeConfig( + model="ngram", num_speculative_tokens=NUM_SPECULATIVE_TOKENS) + + vllm_config = VllmConfig( + scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, + kv_transfer_config=kv_transfer_config, + speculative_config=speculative_config, + ) + + kv_cache_config = KVCacheConfig( + num_blocks=10000, # A large number of blocks to hold all requests + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, + torch.float32, False)) + ], + ) + cache_config.num_gpu_blocks = 10000 + + scheduler = AscendScheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=MagicMock(spec=StructuredOutputManager), + ) + + should_advance = MagicMock() + should_advance.return_value = False + scheduler.structured_output_manager.should_advance = should_advance + + return scheduler + + def test_add_requests(self): + scheduler = self.create_scheduler() + requests = create_requests(num_requests=10) + + for i, request in enumerate(requests): + scheduler.add_request(request) + self.assertIn(request.request_id, scheduler.requests) + self.assertEqual(len(scheduler.waiting), i + 1) + + def test_finish_request(self): + scheduler = self.create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + 
scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_ABORTED) + self.assertNotIn(request.request_id, scheduler.requests) + self.assertEqual(len(scheduler.waiting), 9 - i) + + def test_get_num_unfinished_requests(self): + scheduler = self.create_scheduler() + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + for i, request in enumerate(requests): + scheduler.finish_requests(request.request_id, + RequestStatus.FINISHED_STOPPED) + self.assertEqual(scheduler.get_num_unfinished_requests(), + len(requests) - i - 1) + + def test_schedule(self): + '''Test scheduling. + Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + scheduler = self.create_scheduler() + scheduler.scheduler_config.chunked_prefill_enabled = False + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + self.assertEqual(len(output.scheduled_new_reqs), len(requests)) + self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0) + self.assertEqual(len(output.finished_req_ids), 0) + # Verify all requests are scheduled. + for req_id, num_tokens in output.num_scheduled_tokens.items(): + self.assertEqual(num_tokens, + len(requests[int(req_id)].prompt_token_ids)) + + # Verify requests moved from waiting to running + self.assertEqual(len(scheduler.waiting), 0) + self.assertEqual(len(scheduler.running), len(requests)) + for i, request in enumerate(requests): + self.assertEqual(scheduler.running[i], request) + + def test_schedule_enable_prefix_caching(self): + '''Test scheduling. 
+ Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs + ''' + global ENABLE_PREFIX_CACHING + ENABLE_PREFIX_CACHING = True + global PROMPT_LOGPROBS + PROMPT_LOGPROBS = 5 + scheduler = self.create_scheduler() + scheduler.scheduler_config.chunked_prefill_enabled = False + requests = create_requests(num_requests=10) + for request in requests: + scheduler.add_request(request) + + # Test initial scheduling + output = scheduler.schedule() + self.assertEqual(len(output.scheduled_new_reqs), len(requests)) + self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0) + self.assertEqual(len(output.finished_req_ids), 0) + # Verify all requests are scheduled. + for req_id, num_tokens in output.num_scheduled_tokens.items(): + self.assertEqual(num_tokens, + len(requests[int(req_id)].prompt_token_ids)) + + # Verify requests moved from waiting to running + self.assertEqual(len(scheduler.waiting), 0) + self.assertEqual(len(scheduler.running), len(requests)) + for i, request in enumerate(requests): + self.assertEqual(scheduler.running[i], request) + + def test_stop_via_update_from_output(self): + """Test stopping behavior through update_from_output""" + global NUM_SPECULATIVE_TOKENS + NUM_SPECULATIVE_TOKENS = 1 + scheduler = self.create_scheduler() + + # Test case 1: Stop on EOS token + requests = create_requests(num_requests=2, max_tokens=10) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + req.status = RequestStatus.RUNNING + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + 
finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 1, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [], + requests[1].request_id: [10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[EOS_TOKEN_ID], [ + 10, 11 + ]], # First request hits EOS, second continues + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped, second continues + self.assertEqual(len(scheduler.running), 1) + self.assertEqual(scheduler.running[0].request_id, + requests[1].request_id) + self.assertEqual(requests[0].status, RequestStatus.FINISHED_STOPPED) + self.assertIn(requests[0].request_id, scheduler.finished_req_ids) + self.assertEqual(list(requests[0].output_token_ids), [EOS_TOKEN_ID]) + self.assertEqual(list(requests[1].output_token_ids), [10, 11]) + + # Test case 2: Stop on custom stop token + NUM_SPECULATIVE_TOKENS = 2 + scheduler = self.create_scheduler() + requests = create_requests(num_requests=2, + max_tokens=10, + stop_token_ids=[42, 43]) 
+ for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + req.status = RequestStatus.RUNNING + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 2 + }, + total_num_scheduled_tokens=5, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 42], + requests[1].request_id: [13] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 42, 12], + [13, 14]], # First request hits stop token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped on custom token + 
self.assertEqual(len(scheduler.running), 1) + self.assertEqual(scheduler.running[0].request_id, + requests[1].request_id) + self.assertEqual(requests[0].status, RequestStatus.FINISHED_STOPPED) + self.assertEqual(requests[0].stop_reason, 42) + self.assertIn(requests[0].request_id, scheduler.finished_req_ids) + self.assertEqual(list(requests[0].output_token_ids), [10, 42]) + self.assertEqual(list(requests[1].output_token_ids), [13, 14]) + + # Test case 3: Stop on max tokens + NUM_SPECULATIVE_TOKENS = 2 + scheduler = self.create_scheduler() + requests = create_requests(num_requests=2, max_tokens=2) + for req in requests: + req.num_computed_tokens = req.num_tokens + scheduler.requests[req.request_id] = req + scheduler.running.append(req) + req.status = RequestStatus.RUNNING + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={ + requests[0].request_id: 3, + requests[1].request_id: 1 + }, + total_num_scheduled_tokens=4, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [10, 11], + 
requests[1].request_id: [] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[req.request_id for req in requests], + req_id_to_index={ + req.request_id: i + for i, req in enumerate(requests) + }, + sampled_token_ids=[[10, 11, 12], + [13]], # First request exceeds max_tokens + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + scheduler.update_from_output(scheduler_output, model_output) + + # Verify first request stopped due to length + self.assertEqual(len(scheduler.running), 1) + self.assertEqual(scheduler.running[0].request_id, + requests[1].request_id) + self.assertEqual(requests[0].status, + RequestStatus.FINISHED_LENGTH_CAPPED) + self.assertIn(requests[0].request_id, scheduler.finished_req_ids) + self.assertEqual(list(requests[0].output_token_ids), [10, 11]) + self.assertEqual(list(requests[1].output_token_ids), [13]) + + # Test case 4: Ignore EOS flag + scheduler = self.create_scheduler() + requests = create_requests(num_requests=1, max_tokens=10) + requests[0].sampling_params.ignore_eos = True + requests[0].num_computed_tokens = requests[0].num_tokens + scheduler.requests[requests[0].request_id] = requests[0] + scheduler.running.append(requests[0]) + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_input_ids=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + 
spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + else: + scheduler_output = SchedulerOutput( + scheduled_new_reqs=[], + scheduled_cached_reqs=[], + num_scheduled_tokens={requests[0].request_id: 3}, + total_num_scheduled_tokens=3, + scheduled_encoder_inputs={}, + scheduled_spec_decode_tokens={ + requests[0].request_id: [EOS_TOKEN_ID, 10] + }, + num_common_prefix_blocks=0, + finished_req_ids=set(), + free_encoder_mm_hashes=[], + structured_output_request_ids={}, + grammar_bitmask=None) + model_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + scheduler.update_from_output(scheduler_output, model_output) + + # Verify request continues past EOS + self.assertEqual(len(scheduler.running), 1) + self.assertFalse(requests[0].is_finished()) + self.assertEqual(list(requests[0].output_token_ids), + [EOS_TOKEN_ID, 10, 11]) + + def test_schedule_concurrent_batches(self): + global MAX_NUM_BATCHED_TOKENS + global ENABLE_PREFIX_CACHING + global ENABLE_CHUNKED_PREFILL + global MAX_NUM_SEQS + global PROMPT_LOGPROBS + ENABLE_PREFIX_CACHING = None + MAX_NUM_BATCHED_TOKENS = 1024 + MAX_NUM_SEQS = 2 + ENABLE_CHUNKED_PREFILL = True + PROMPT_LOGPROBS = None + + enable_prefix_caching_list = [None, True] + prompt_logprobs_list = [None, 5] + + for i in range(len(enable_prefix_caching_list)): + ENABLE_PREFIX_CACHING = enable_prefix_caching_list[i] + PROMPT_LOGPROBS = prompt_logprobs_list[i] + scheduler = self.create_scheduler() + requests = create_requests( + num_requests=2, + num_tokens=512, + ) + + # Schedule the first request. 
+ scheduler.add_request(requests[0]) + scheduler_output0 = scheduler.schedule() + self.assertEqual(len(scheduler_output0.scheduled_new_reqs), 1) + self.assertEqual( + scheduler_output0.num_scheduled_tokens[requests[0].request_id], + 512) + + # The first request is still running, so only schedule the second request. + scheduler.add_request(requests[1]) + scheduler_output1 = scheduler.schedule() + self.assertEqual(len(scheduler_output1.scheduled_new_reqs), 1) + self.assertEqual( + scheduler_output1.num_scheduled_tokens[requests[1].request_id], + 512) + + # Model output of the first request. + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[0].request_id], + req_id_to_index={requests[0].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + scheduler.update_from_output(scheduler_output0, + model_runner_output) + + # Schedule the next step. + # The first request can be scheduled again while the second + # request is still running. + scheduler.schedule() + # Model output of the second request. 
+ if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=[requests[1].request_id], + req_id_to_index={requests[1].request_id: 0}, + sampled_token_ids=[[0]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + scheduler.update_from_output(scheduler_output1, + model_runner_output) + + def test_schedule_spec_decoding_stats(self): + """Test scheduling behavior with speculative decoding. + + This test verifies that: + 1. Speculated tokens get scheduled correctly + 2. Spec decoding stats properly count number of draft and accepted tokens + """ + spec_tokens_list: List[List[List[int]]] = [[[1, 2, 3]], [[1, 2, 3]], + [[1, 2], [3]], [[1]], [[]], + [[1, 2, 3], [4, 5, 6]]] + output_tokens_list: List[List[List[int]]] = [[[1, 2, 3, 4]], [[1, 5]], + [[1, 2, 5], [3, 4]], + [[1, 2]], [[5]], + [[1, 2, 7], [4, 8]]] + expected_list: List[Tuple[int, int, + int, List[int]]] = [(1, 3, 3, [1, 1, 1]), + (1, 3, 1, [1, 0, 0]), + (2, 3, 3, [2, 1]), + (1, 1, 1, [1]), + (0, 0, 0, [0]), + (2, 6, 3, [2, 1, 0])] + + global NUM_SPECULATIVE_TOKENS + for idx in range(len(spec_tokens_list)): + spec_tokens = spec_tokens_list[idx] + output_tokens = output_tokens_list[idx] + expected = expected_list[idx] + num_spec_tokens = max(1, max(len(t) for t in spec_tokens)) + NUM_SPECULATIVE_TOKENS = num_spec_tokens + scheduler = self.create_scheduler() + requests = create_requests(num_requests=len(spec_tokens), + num_tokens=1) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i + + # Schedule a decode, which will also draft speculative tokens + output = 
scheduler.schedule() + self.assertEqual(len(output.scheduled_new_reqs), len(requests)) + self.assertEqual(output.total_num_scheduled_tokens, len(requests)) + for i in range(len(requests)): + req_id = requests[i].request_id + self.assertEqual(output.num_scheduled_tokens[req_id], 1) + self.assertNotIn(req_id, output.scheduled_spec_decode_tokens) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + spec_token_ids=spec_tokens, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=[[0] for _ in range(len(requests))], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + draft_token_ids = DraftTokenIds(req_ids, spec_tokens) + + engine_core_outputs = scheduler.update_from_output( + output, model_runner_output) + if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): + scheduler.update_draft_token_ids(draft_token_ids) + + for i in range(len(requests)): + running_req = scheduler.running[i] + # The prompt token + self.assertEqual(running_req.num_computed_tokens, 1) + # The prompt token and the sampled token + self.assertEqual(running_req.num_tokens, 2) + # The prompt token, the sampled token, and the speculated tokens + self.assertEqual(running_req.num_tokens_with_spec, + 2 + len(spec_tokens[i])) + + # No draft or accepted tokens counted yet + self.assertTrue( + not engine_core_outputs + or (engine_core_outputs[0].scheduler_stats.spec_decoding_stats + is None)) + + # Schedule the speculated tokens for validation + output = scheduler.schedule() + self.assertEqual(len(output.scheduled_new_reqs), 0) + # The sampled token and speculated tokens + self.assertEqual( + output.total_num_scheduled_tokens, + len(requests) + sum(len(ids) for ids in spec_tokens)) + for i in 
range(len(requests)): + req_id = requests[i].request_id + self.assertEqual(output.num_scheduled_tokens[req_id], + 1 + len(spec_tokens[i])) + if spec_tokens[i]: + self.assertEqual( + len(output.scheduled_spec_decode_tokens[req_id]), + len(spec_tokens[i])) + else: + self.assertNotIn(req_id, + output.scheduled_spec_decode_tokens) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_to_index, + sampled_token_ids=output_tokens, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[]) + + engine_core_outputs = scheduler.update_from_output( + output, model_runner_output) + + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None + if expected[0] == 0: + self.assertIsNone(scheduler_stats.spec_decoding_stats) + else: + self.assertIsNotNone(scheduler_stats.spec_decoding_stats) + stats = scheduler_stats.spec_decoding_stats + self.assertEqual(stats.num_drafts, expected[0]) + self.assertEqual(stats.num_draft_tokens, expected[1]) + self.assertEqual(stats.num_accepted_tokens, expected[2]) + self.assertEqual(stats.num_accepted_tokens_per_pos, + expected[3]) + + def assert_scheduler_empty(self, scheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. + scheduler = self.create_scheduler() + self.assertEqual(len(scheduler.requests), 0) + self.assertEqual(len(scheduler.waiting), 0) + self.assertEqual(len(scheduler.running), 0) + self.assertEqual(len(scheduler.finished_req_ids), 0) + + # EncoderCacheManager. + self.assertEqual(len(scheduler.encoder_cache_manager.freed), 0) + self.assertEqual(len(scheduler.encoder_cache_manager.cached), 0) + + # KVCache Manager. 
+ self.assertEqual( + len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + req_to_blocks), 0) + self.assertEqual( + len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block), 0) + num_free_blocks = (scheduler.kv_cache_manager.block_pool. + free_block_queue.num_free_blocks) + self.assertEqual( + num_free_blocks, + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. + for block in scheduler.kv_cache_manager.block_pool.blocks: + self.assertEqual(block.ref_cnt, 0) + + def test_memory_leak(self): + """Test that we do not have a memory leak.""" + scheduler = self.create_scheduler() + NUM_REQUESTS = 5 + NUM_TOKENS = 10 + MAX_TOKENS = 10 + requests = create_requests(num_requests=NUM_REQUESTS, + num_tokens=NUM_TOKENS, + max_tokens=MAX_TOKENS) + + # Add each request. + for request in requests: + scheduler.add_request(request) + scheduler_output = scheduler.schedule() + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Iterate until done. + while True: + scheduler_output = scheduler.schedule() + if len(scheduler.running) == 0: + break + model_runner_output = make_output(scheduler) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm no memory leak. + self.assert_scheduler_empty(scheduler) diff --git a/tests/ut/device_allocator/test_camem.py b/tests/ut/device_allocator/test_camem.py new file mode 100644 index 0000000..ec500e7 --- /dev/null +++ b/tests/ut/device_allocator/test_camem.py @@ -0,0 +1,188 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from tests.ut.base import PytestBase +from vllm_ascend.device_allocator.camem import (AllocationData, CaMemAllocator, + create_and_map, + find_loaded_library, + get_pluggable_allocator, + unmap_and_release) + + +def dummy_malloc(args): + pass + + +def dummy_free(ptr): + return (0, 0, 0, 0) + + +class TestCaMem(PytestBase): + + def test_find_loaded_library_success_and_not_found(self): + path = find_loaded_library("libc") + assert path is not None, "Expected to find libc library" + assert path.endswith(".so.6") or ".so" in path + assert "libc" in path + + path = find_loaded_library("non_existent_library") + assert path is None, "Expected to not find non-existent library" + + @pytest.mark.parametrize("handle", [ + (1, 2, 3), + ("device", 99), + (None, ), + ]) + def test_create_and_map_calls_python_create_and_map(self, handle): + with patch("vllm_ascend.device_allocator.camem.python_create_and_map" + ) as mock_create: + create_and_map(handle) + mock_create.assert_called_once_with(*handle) + + @pytest.mark.parametrize("handle", [ + (42, "bar"), + ("foo", ), + ]) + def test_unmap_and_release_calls_python_unmap_and_release(self, handle): + with patch( + "vllm_ascend.device_allocator.camem.python_unmap_and_release" + ) as mock_release: + unmap_and_release(handle) + mock_release.assert_called_once_with(*handle) + + @patch("vllm_ascend.device_allocator.camem.init_module") + @patch( + 
"vllm_ascend.device_allocator.camem.torch.npu.memory.NPUPluggableAllocator" + ) + def test_get_pluggable_allocator(self, mock_allocator_class, + mock_init_module): + mock_allocator_instance = MagicMock() + mock_allocator_class.return_value = mock_allocator_instance + + def side_effect_malloc_and_free(malloc_fn, free_fn): + malloc_fn((1, 2, 3)) + free_fn(123) + + mock_init_module.side_effect = side_effect_malloc_and_free + + allocator = get_pluggable_allocator(dummy_malloc, dummy_free) + mock_init_module.assert_called_once_with(dummy_malloc, dummy_free) + assert allocator == mock_allocator_instance + + def test_singleton_behavior(self): + instance1 = CaMemAllocator.get_instance() + instance2 = CaMemAllocator.get_instance() + assert instance1 is instance2 + + def test_python_malloc_and_free_callback(self): + allocator = CaMemAllocator.get_instance() + + # mock allocation_handle + handle = (1, 100, 1234, 0) + allocator.current_tag = "test_tag" + + allocator.python_malloc_callback(handle) + # check pointer_to_data store data + ptr = handle[2] + assert ptr in allocator.pointer_to_data + data = allocator.pointer_to_data[ptr] + assert data.handle == handle + assert data.tag == "test_tag" + + # check free callback with cpu_backup_tensor + data.cpu_backup_tensor = torch.zeros(1) + result_handle = allocator.python_free_callback(ptr) + assert result_handle == handle + assert ptr not in allocator.pointer_to_data + assert data.cpu_backup_tensor is None + + @patch("vllm_ascend.device_allocator.camem.unmap_and_release") + @patch("vllm_ascend.device_allocator.camem.memcpy") + def test_sleep_offload_and_discard(self, mock_memcpy, mock_unmap): + allocator = CaMemAllocator.get_instance() + + # prepare allocation, one tag match,one not match + handle1 = (1, 10, 1000, 0) + data1 = AllocationData(handle1, "tag1") + handle2 = (2, 20, 2000, 0) + data2 = AllocationData(handle2, "tag2") + allocator.pointer_to_data = { + 1000: data1, + 2000: data2, + } + + # mock is_pin_memory_available, 
return False as some machine only has cpu + with patch( + "vllm_ascend.device_allocator.camem.NPUPlatform.is_pin_memory_available", + return_value=False): + allocator.sleep(offload_tags="tag1") + + # only offload tag1, other tag2 call unmap_and_release + assert data1.cpu_backup_tensor is not None + assert data2.cpu_backup_tensor is None + mock_unmap.assert_any_call(handle1) + mock_unmap.assert_any_call(handle2) + assert mock_unmap.call_count == 2 + assert mock_memcpy.called + + @patch("vllm_ascend.device_allocator.camem.create_and_map") + @patch("vllm_ascend.device_allocator.camem.memcpy") + def test_wake_up_loads_and_clears_cpu_backup(self, mock_memcpy, + mock_create_and_map): + allocator = CaMemAllocator.get_instance() + + handle = (1, 10, 1000, 0) + tensor = torch.zeros(5, dtype=torch.uint8) + data = AllocationData(handle, "tag1", cpu_backup_tensor=tensor) + allocator.pointer_to_data = {1000: data} + + allocator.wake_up(tags=["tag1"]) + + mock_create_and_map.assert_called_once_with(handle) + assert data.cpu_backup_tensor is None + assert mock_memcpy.called + + def test_use_memory_pool_context_manager(self): + allocator = CaMemAllocator.get_instance() + old_tag = allocator.current_tag + + # mock use_memory_pool_with_allocator + mock_ctx = MagicMock() + mock_ctx.__enter__.return_value = "data" + mock_ctx.__exit__.return_value = None + + with patch( + "vllm_ascend.device_allocator.camem.use_memory_pool_with_allocator", + return_value=mock_ctx): + with allocator.use_memory_pool(tag="my_tag"): + assert allocator.current_tag == "my_tag" + # restore old tag after context manager exits + assert allocator.current_tag == old_tag + + def test_get_current_usage(self): + allocator = CaMemAllocator.get_instance() + + allocator.pointer_to_data = { + 1: AllocationData((0, 100, 1, 0), "tag"), + 2: AllocationData((0, 200, 2, 0), "tag"), + } + + usage = allocator.get_current_usage() + assert usage == 300 diff --git a/tests/ut/distributed/device_communicators/test_pyhccl.py 
b/tests/ut/distributed/device_communicators/test_pyhccl.py new file mode 100644 index 0000000..16eb095 --- /dev/null +++ b/tests/ut/distributed/device_communicators/test_pyhccl.py @@ -0,0 +1,84 @@ +import os +from unittest.mock import MagicMock, patch + +from vllm.distributed.utils import StatelessProcessGroup + +from tests.ut.base import TestBase +from vllm_ascend.distributed.device_communicators.pyhccl import \ + PyHcclCommunicator + + +class MockHcclLib: + pass + + +class MockUniqueId: + pass + + +class TestPyHcclCommunicator(TestBase): + + @patch.dict(os.environ, {"RANK": "0", "WORLD_SIZE": "1"}) + def test_world_size_1_return_early(self): + comm = PyHcclCommunicator( + group=StatelessProcessGroup(0, 1, None, None), + device="npu:0", + ) + self.assertTrue(comm.disabled) + self.assertFalse(comm.available) + + @patch.dict(os.environ, {"RANK": "0", "WORLD_SIZE": "2"}) + def test_load_hccl_fail(self): + comm = PyHcclCommunicator(group=StatelessProcessGroup( + 0, 2, None, None), + device="npu:0", + library_path="/not/exist/path/libhccl.so") + self.assertTrue(comm.disabled) + + @patch( + "vllm_ascend.distributed.device_communicators.pyhccl_wrapper.HCCLLibrary", + MockHcclLib) + @patch( + "vllm_ascend.distributed.device_communicators.pyhccl_wrapper.hcclUniqueId", + MockUniqueId) + @patch("torch.npu.device") + @patch("vllm_ascend.utils.current_stream", + return_value=MagicMock(npu_stream=5678)) + def test_stateless_group(self, *_): + group = StatelessProcessGroup(rank=3, + world_size=4, + store=None, + socket=None) + + comm = PyHcclCommunicator(group=group, device=3) + + self.assertEqual(comm.rank, 3) + self.assertEqual(comm.world_size, 4) + + @patch.dict(os.environ, {"RANK": "1", "WORLD_SIZE": "2"}) + @patch( + "vllm_ascend.distributed.device_communicators.pyhccl_wrapper.HCCLLibrary", + MockHcclLib) + @patch( + "vllm_ascend.distributed.device_communicators.pyhccl_wrapper.hcclUniqueId", + MockUniqueId) + @patch("torch.distributed.is_initialized", return_value=True) + 
@patch("torch.distributed.get_backend", return_value="nccl") + @patch("torch.distributed.get_rank", return_value=1) + @patch("torch.distributed.get_world_size", return_value=2) + @patch("torch.distributed.get_process_group_ranks", return_value=[0, 1]) + @patch("torch.distributed.broadcast") + @patch("torch.npu.device") + @patch("vllm_ascend.utils.current_stream", + return_value=MagicMock(npu_stream=1234)) + def test_multi_gpu_pg_torch( + self, + *_, + ): + fake_pg = MagicMock() + comm = PyHcclCommunicator(group=fake_pg, device="npu:1") + + self.assertEqual(comm.rank, 1) + self.assertEqual(comm.world_size, 2) + self.assertFalse(comm.available) + self.assertTrue(comm.disabled) diff --git a/tests/ut/distributed/device_communicators/test_pyhccl_wrapper.py b/tests/ut/distributed/device_communicators/test_pyhccl_wrapper.py new file mode 100644 index 0000000..ff90512 --- /dev/null +++ b/tests/ut/distributed/device_communicators/test_pyhccl_wrapper.py @@ -0,0 +1,173 @@ +from unittest.mock import MagicMock, patch + +import torch +from torch.distributed import ReduceOp + +from tests.ut.base import TestBase +from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import ( + Function, HCCLLibrary, aclrtStream_t, buffer_type, hcclComm_t, + hcclDataType_t, hcclDataTypeEnum, hcclRedOp_t, hcclRedOpTypeEnum, + hcclResult_t, hcclUniqueId) + + +class TestHcclUniqueId(TestBase): + + def test_construct(self): + uid = hcclUniqueId() + uid.internal[0] = 12 + self.assertEqual(len(uid.internal), 4108) + self.assertEqual(uid.internal[0], 12) + + +class TestHcclDataTypeEnum(TestBase): + + def test_torch_dtype_mapping(self): + expected = { + torch.int8: hcclDataTypeEnum.hcclInt8, + torch.uint8: hcclDataTypeEnum.hcclUint8, + torch.int32: hcclDataTypeEnum.hcclInt32, + torch.int64: hcclDataTypeEnum.hcclInt64, + torch.float16: hcclDataTypeEnum.hcclFloat16, + torch.float32: hcclDataTypeEnum.hcclFloat32, + torch.float64: hcclDataTypeEnum.hcclFloat64, + torch.bfloat16: 
hcclDataTypeEnum.hcclBfloat16, + } + + for torch_dtype, expected_enum in expected.items(): + with self.subTest(torch_dtype=torch_dtype): + self.assertEqual(hcclDataTypeEnum.from_torch(torch_dtype), + expected_enum) + + def test_unsupported_dtype_raises(self): + with self.assertRaises(ValueError): + hcclDataTypeEnum.from_torch(torch.complex64) + + +class TestHcclRedOpTypeEnum(TestBase): + + def test_torch_reduce_op_mapping(self): + expected = { + ReduceOp.SUM: hcclRedOpTypeEnum.hcclSum, + ReduceOp.PRODUCT: hcclRedOpTypeEnum.hcclProd, + ReduceOp.MAX: hcclRedOpTypeEnum.hcclMax, + ReduceOp.MIN: hcclRedOpTypeEnum.hcclMin, + } + + for torch_op, expected_enum in expected.items(): + with self.subTest(torch_op=torch_op): + self.assertEqual(hcclRedOpTypeEnum.from_torch(torch_op), + expected_enum) + + def test_unsupported_op_raises(self): + unsupported_op = "NOT_EXIST" + with self.assertRaises(ValueError): + hcclRedOpTypeEnum.from_torch(unsupported_op) + + +class TestFunction(TestBase): + + def test_construct_with_valid_args(self): + func = Function(name="foo", restype=int, argtypes=[int, str, float]) + self.assertEqual(func.name, "foo") + self.assertIs(func.restype, int) + self.assertEqual(func.argtypes, [int, str, float]) + + +class TestHCLLLibrary(TestBase): + + def test_init_with_nonexistent_so(self): + fake_path = "/definitely/not/exist/libhccl.so" + with self.assertRaises(OSError): + HCCLLibrary(fake_path) + + def test_hccl_get_error_string(self): + lib = MagicMock(sepc=HCCLLibrary) + mock_fn = MagicMock() + mock_fn.return_value = "HCCL internal error" + lib.hcclGetErrorString = mock_fn + + result = hcclResult_t(1) + msg = lib.hcclGetErrorString(result) + self.assertEqual(msg, "HCCL internal error") + mock_fn.assert_called_once() + + def test_hccl_check(self): + lib = HCCLLibrary.__new__(HCCLLibrary) + mock_fn = MagicMock() + mock_fn.return_value = "fake error" + lib.hcclGetErrorString = mock_fn + result = hcclResult_t(123) + with self.assertRaises(RuntimeError) as cm: 
+ lib.HCCL_CHECK(result) + + self.assertEqual(str(cm.exception), "HCCL error: fake error") + + @patch.object(HCCLLibrary, "HCCL_CHECK") + def test_hccl_get_uniqueId(self, mock_HCCL_CHECK): + lib = HCCLLibrary.__new__(HCCLLibrary) + lib._funcs = {"HcclGetRootInfo": MagicMock(return_value=0)} + unique_id = lib.hcclGetUniqueId() + self.assertIsInstance(unique_id, hcclUniqueId) + lib._funcs["HcclGetRootInfo"].assert_called_once() + mock_HCCL_CHECK.assert_called_once_with(0) + + @patch.object(HCCLLibrary, "HCCL_CHECK") + def test_hccl_comm_initRank(self, mock_hccl_check): + lib = HCCLLibrary.__new__(HCCLLibrary) + lib._funcs = {"HcclCommInitRootInfo": MagicMock(return_value=0)} + + world_size = 4 + unique_id = hcclUniqueId() + rank = 1 + + comm = lib.hcclCommInitRank(world_size, unique_id, rank) + self.assertIsInstance(comm, hcclComm_t) + lib._funcs["HcclCommInitRootInfo"].assert_called_once() + mock_hccl_check.assert_called_once_with(0) + + @patch.object(HCCLLibrary, "HCCL_CHECK") + def test_hccl_all_reduce(self, mock_hccl_check): + + lib = HCCLLibrary.__new__(HCCLLibrary) + lib._funcs = {"HcclAllReduce": MagicMock(return_value=0)} + sendbuff = buffer_type() + recvbuff = buffer_type() + count = 10 + datatype = hcclDataType_t(1) + op = hcclRedOp_t(0) + comm = hcclComm_t() + stream = aclrtStream_t() + + lib.hcclAllReduce(sendbuff, recvbuff, count, datatype, op, comm, + stream) + + lib._funcs["HcclAllReduce"].assert_called_once_with( + sendbuff, recvbuff, count, datatype, op, comm, stream) + mock_hccl_check.assert_called_once_with(0) + + @patch.object(HCCLLibrary, "HCCL_CHECK") + def test_hccl_broad_cast(self, mock_hccl_check): + + lib = HCCLLibrary.__new__(HCCLLibrary) + lib._funcs = {"HcclBroadcast": MagicMock(return_value=0)} + buff = buffer_type() + count = 10 + datatype = 1 + root = 0 + comm = hcclComm_t() + stream = aclrtStream_t() + + lib.hcclBroadcast(buff, count, datatype, root, comm, stream) + + lib._funcs["HcclBroadcast"].assert_called_once_with( + buff, count, 
datatype, root, comm, stream) + mock_hccl_check.assert_called_once_with(0) + + @patch.object(HCCLLibrary, "HCCL_CHECK") + def test_hcclCommDestroy_success(self, mock_hccl_check): + lib = HCCLLibrary.__new__(HCCLLibrary) + lib._funcs = {"HcclCommDestroy": MagicMock(return_value=0)} + comm = hcclComm_t() + lib.hcclCommDestroy(comm) + lib._funcs["HcclCommDestroy"].assert_called_once_with(comm) + mock_hccl_check.assert_called_once_with(0) diff --git a/tests/ut/distributed/test_communicator.py b/tests/ut/distributed/test_communicator.py new file mode 100644 index 0000000..edaae2a --- /dev/null +++ b/tests/ut/distributed/test_communicator.py @@ -0,0 +1,89 @@ +import unittest +from unittest.mock import MagicMock, patch + +import torch +import torch.distributed as dist + +from vllm_ascend.distributed.communicator import NPUCommunicator + + +class TestNPUCommunicator(unittest.TestCase): + + @patch("vllm.config.get_current_vllm_config", return_value=None) + @patch("torch.npu.current_device", return_value=MagicMock()) + @patch("torch.npu.set_device", return_value=MagicMock()) + @patch("torch.distributed.get_process_group_ranks", + return_value={ + 0: 0, + 1: 1 + }) + @patch("torch.distributed.get_group_rank", return_value={0: 0, 1: 1}) + @patch("torch.distributed.is_initialized", return_value=True) + @patch("torch.distributed.get_rank", return_value=1) + @patch("torch.distributed.is_initialized", return_value=True) + @patch("torch.distributed.get_backend", return_value="hccl") + @patch("torch.distributed.get_rank", return_value=1) + @patch("torch.distributed.get_world_size", return_value=2) + @patch("torch.distributed.get_process_group_ranks", return_value=[0, 1]) + @patch("torch.npu.device") + def test_all_to_all_with_sizes(self, *_): + + def patched_all_to_all(output_tensor_list, + input_tensor_list, + group=None, + async_op=False): + output_tensor_list[:] = ([ + torch.tensor([10, 20]), + torch.tensor([50, 60]) + ]) + + torch.distributed.all_to_all = patched_all_to_all + + 
scatter_sizes = [2, 2] + gather_sizes = [2, 2] + input_ = torch.tensor([10, 20, 30, 40]) + + comm = NPUCommunicator(cpu_group=dist.group.WORLD) + + output = comm.all_to_all(input_, + scatter_sizes=scatter_sizes, + gather_sizes=gather_sizes) + + assert output.tolist() == [10, 20, 50, 60] + + @patch("vllm.config.get_current_vllm_config", return_value=None) + @patch("torch.npu.current_device", return_value=MagicMock()) + @patch("torch.npu.set_device", return_value=MagicMock()) + @patch("torch.distributed.get_process_group_ranks", + return_value={ + 0: 0, + 1: 1 + }) + @patch("torch.distributed.get_group_rank", return_value={0: 0, 1: 1}) + @patch("torch.distributed.is_initialized", return_value=True) + @patch("torch.distributed.get_rank", return_value=1) + @patch("torch.distributed.is_initialized", return_value=True) + @patch("torch.distributed.get_backend", return_value="hccl") + @patch("torch.distributed.get_rank", return_value=1) + @patch("torch.distributed.get_world_size", return_value=2) + @patch("torch.distributed.get_process_group_ranks", return_value=[0, 1]) + @patch("torch.npu.device") + def test_all_to_all_without_sizes(self, *_): + + def patched_all_to_all(output_tensor_list, + input_tensor_list, + group=None, + async_op=False): + output_tensor_list[:] = ([ + torch.tensor([[10, 20]]), + torch.tensor([[50, 60]]) + ]) + + torch.distributed.all_to_all = patched_all_to_all + + input_ = torch.tensor([[10, 20], [30, 40]]) + + comm = NPUCommunicator(cpu_group=dist.group.WORLD) + output = comm.all_to_all(input_, scatter_dim=0, gather_dim=0) + + assert output.tolist() == [[10, 20], [50, 60]] diff --git a/tests/ut/distributed/test_distributed_tensor_parallel.py b/tests/ut/distributed/test_distributed_tensor_parallel.py new file mode 100644 index 0000000..48a88fa --- /dev/null +++ b/tests/ut/distributed/test_distributed_tensor_parallel.py @@ -0,0 +1,139 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +import importlib + +import pytest +import torch +from pytest_mock import MockerFixture + +from tests.ut.base import PytestBase +from vllm_ascend.distributed.tensor_parallel import ( + _gather_along_first_dim, _gather_along_last_dim, + _reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim, + all_to_all_hp2sp, all_to_all_sp2hp) + + +class TestDistributedCommunication(PytestBase): + + @pytest.fixture(autouse=True) + def context(self, mocker: MockerFixture): + mocker.patch("torch.npu.current_device", return_value="cpu") + mocker.patch("torch.distributed.get_world_size", return_value=4) + + mocker.patch("torch.distributed.get_rank", return_value=0) + + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16), (8, 16)), + (4, torch.randn(8, 16), (32, 16))]) + def test_gather_along_first_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): + """test _gather_along_first_dim""" + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) + + result = _gather_along_first_dim(test_tensor, mocker.MagicMock()) + + assert result.shape == expected + + @pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [ + (torch.randn(8, 16), [5, 10, 15, 2], (32, 16)), + ]) + def test_gather_along_first_dim_unequal_split(self, test_tensor, expected, + output_split_sizes, + mocker: MockerFixture): + """test 
_gather_along_first_dim""" + + result = _gather_along_first_dim(test_tensor, mocker.MagicMock(), + output_split_sizes) + + assert result.shape == expected + + @pytest.mark.parametrize("world_size, test_tensor, expected", + [(1, torch.randn(8, 16, 32), (8, 16, 32)), + (4, torch.randn(8, 16, 32), (8, 16, 32 * 4))]) + def test_gather_along_last_dim(self, test_tensor, expected, world_size, + mocker: MockerFixture): + """test _gather_along_last_dim""" + mocker.patch("torch.distributed.get_world_size", + return_value=world_size) + + result = _gather_along_last_dim(test_tensor, mocker.MagicMock()) + + assert result.shape == expected + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((32, 16), (8, 16)), + ((40, 10), (10, 10)), + ]) + def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_first_dim(input_tensor, + mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize("input_shape,expected_shape", [ + ((8, 16, 32), (8, 16, 8)), + ]) + def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = _reduce_scatter_along_last_dim(input_tensor, + mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize("func,input_shape,expected_shape", [ + ("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32), + (8, 16, 128)), + ("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)), + ("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32), + (8, 16, 8)), + ("gather_from_sequence_parallel_region", (8, 16), (32, 16)), + ]) + def test_wrapper_functions(self, func, input_shape, expected_shape, + mocker: MockerFixture): + """test wrapper funcs""" + mod = importlib.import_module( + 'vllm_ascend.distributed.tensor_parallel') + globals = mod.__dict__ + test_func = globals[func] + 
input_tensor = torch.randn(*input_shape) + result = test_func(input_tensor, mocker.MagicMock()) + assert result.shape == expected_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP] + ]) + def test_all_to_all_sp2hp(self, input_shape, output_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = all_to_all_sp2hp(input_tensor, mocker.MagicMock()) + assert result.shape == output_shape + + @pytest.mark.parametrize( + "input_shape,output_shape", + [ + ((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H] + ]) + def test_all_to_all_hp2sp(self, input_shape, output_shape, + mocker: MockerFixture): + input_tensor = torch.randn(*input_shape) + result = all_to_all_hp2sp(input_tensor, mocker.MagicMock()) + assert result.shape == output_shape diff --git a/tests/ut/distributed/test_parallel_state.py b/tests/ut/distributed/test_parallel_state.py new file mode 100644 index 0000000..afc22c8 --- /dev/null +++ b/tests/ut/distributed/test_parallel_state.py @@ -0,0 +1,44 @@ +from unittest.mock import MagicMock, patch + +import pytest +from vllm.config import ParallelConfig + +from vllm_ascend.distributed.parallel_state import ( + _LMTP, _MC2, destroy_ascend_model_parallel, get_lmhead_tp_group, + get_mc2_group, init_ascend_model_parallel) + + +@pytest.fixture +def parallel_config(): + return ParallelConfig(data_parallel_size=2, + tensor_parallel_size=2, + pipeline_parallel_size=2) + + +@pytest.fixture +def mock_distributed(): + with patch('torch.distributed.is_initialized', return_value=True), \ + patch('torch.distributed.get_world_size', return_value=8), \ + patch('torch.distributed.get_backend', return_value='nccl'), \ + patch('vllm_ascend.distributed.parallel_state.get_world_group') as mock_group: + mock_group.return_value.local_rank = 0 + mock_group.return_value.device_group = MagicMock() + yield + + +def test_init_ascend_model_parallel(mock_distributed, 
parallel_config): + mock_ascend_config = MagicMock() + mock_ascend_config.lmhead_tensor_parallel_size = 2 + with patch('vllm_ascend.distributed.parallel_state.model_parallel_initialized', return_value=False), \ + patch('vllm_ascend.distributed.parallel_state.init_model_parallel_group'), \ + patch('vllm_ascend.distributed.parallel_state.get_ascend_config', return_value=mock_ascend_config): + init_ascend_model_parallel(parallel_config) + + mc2_group = get_mc2_group() + assert mc2_group is not None + lmheadtp_group = get_lmhead_tp_group() + assert lmheadtp_group is not None + + destroy_ascend_model_parallel() + assert _MC2 is None + assert _LMTP is None diff --git a/tests/ut/fake_weight/config.json b/tests/ut/fake_weight/config.json new file mode 100644 index 0000000..b3fb716 --- /dev/null +++ b/tests/ut/fake_weight/config.json @@ -0,0 +1,28 @@ +{ + "_name_or_path": "facebook/opt-125m", + "activation_dropout": 0.0, + "activation_function": "relu", + "architectures": [ + "OPTForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 2, + "do_layer_norm_before": true, + "dropout": 0.1, + "eos_token_id": 2, + "ffn_dim": 3072, + "hidden_size": 768, + "init_std": 0.02, + "layerdrop": 0.0, + "max_position_embeddings": 2048, + "model_type": "opt", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "prefix": "", + "torch_dtype": "float16", + "transformers_version": "4.21.0.dev0", + "use_cache": true, + "vocab_size": 50272, + "word_embed_proj_dim": 768 +} diff --git a/tests/ut/kv_connector/test_llmdatadist_connector.py b/tests/ut/kv_connector/test_llmdatadist_connector.py new file mode 100644 index 0000000..b70482f --- /dev/null +++ b/tests/ut/kv_connector/test_llmdatadist_connector.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+
+import os
+import types
+
+from tests.ut.kv_connector.utils import (create_request, create_scheduler,
+                                         create_vllm_config)
+from vllm_ascend.distributed.llmdatadist_c_mgr_connector import (
+    LLMDataDistCMgrConnectorMetadata, LLMDataDistCMgrConnectorWorker, LLMRole)
+
+
+def test_basic_interface():
+    """Unit test for basic LLMDataDistCMgrConnector interface functionality."""
+
+    vllm_config = create_vllm_config()
+    scheduler = create_scheduler(vllm_config)
+
+    # 2 Full Blocks and 1 Half Block.
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    NUM_EXTERNAL_FULL_BLOCKS = 2
+    NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5))
+
+    request = create_request(request_id=1,
+                             num_tokens=NUM_TOKENS,
+                             do_remote_prefill=True)
+    request_id = request.request_id
+
+    scheduler.add_request(request)
+
+    # Remote Prefill, triggers LLMDataDistCMgrConnectorMetadata.
+    scheduler_output = scheduler.schedule()
+    kv_connector_metadata = scheduler_output.kv_connector_metadata
+    assert kv_connector_metadata is not None
+    assert isinstance(kv_connector_metadata, LLMDataDistCMgrConnectorMetadata)
+
+    assert len(kv_connector_metadata.requests) == 1
+    assert request_id in kv_connector_metadata.requests
+    req_meta = kv_connector_metadata.requests[request_id]
+
+    for block_id, block in zip(
+            req_meta.local_block_ids, scheduler.kv_cache_manager.coordinator.
+ single_type_managers[0].req_to_blocks[request_id]): + assert block_id == block.block_id + + +def test_read_agent_metadata(): + rank_table = { + "version": + "1.2", + "server_count": + "2", + "prefill_device_list": [{ + "server_id": "192.168.1.1", + "device_id": "0", + "device_ip": "10.30.0.1", + "cluster_id": "0", + }, { + "server_id": "192.168.1.1", + "device_id": "1", + "device_ip": "10.30.0.2", + "cluster_id": "1", + }, { + "server_id": "192.168.1.2", + "device_id": "0", + "device_ip": "10.30.0.3", + "cluster_id": "2", + }, { + "server_id": "192.168.1.2", + "device_id": "1", + "device_ip": "10.30.0.4", + "cluster_id": "3", + }] + } + + def get_device_ip(worker_local_ip, worker_tp_rank, worker_visible_devices): + old_visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "") + worker = types.SimpleNamespace() + worker.local_ip = worker_local_ip + worker.tp_rank = worker_tp_rank + worker.llm_datadist_role = LLMRole.PROMPT + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = worker_visible_devices + agent_metadata = LLMDataDistCMgrConnectorWorker.read_agent_metadata( + worker, rank_table) + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = old_visible_devices + return agent_metadata.device_ip + + assert get_device_ip("192.168.1.1", 0, "0") == "10.30.0.1" + assert get_device_ip("192.168.1.1", 0, "1") == "10.30.0.2" + assert get_device_ip("192.168.1.2", 0, "0") == "10.30.0.3" + assert get_device_ip("192.168.1.2", 0, "1") == "10.30.0.4" + assert get_device_ip("192.168.1.1", 0, "0,1") == "10.30.0.1" + assert get_device_ip("192.168.1.1", 1, "0,1") == "10.30.0.2" + assert get_device_ip("192.168.1.1", 0, "") == "10.30.0.1" + assert get_device_ip("192.168.1.1", 1, "") == "10.30.0.2" diff --git a/tests/ut/kv_connector/test_mooncake_connector.py b/tests/ut/kv_connector/test_mooncake_connector.py new file mode 100644 index 0000000..0b2782d --- /dev/null +++ b/tests/ut/kv_connector/test_mooncake_connector.py @@ -0,0 +1,998 @@ +import os +import queue +import socket +import sys +import 
threading +import time +import types +import unittest +from collections import defaultdict, deque +from unittest.mock import MagicMock, patch + +import msgspec +import zmq +from vllm.utils import make_zmq_path + +fake_engine = types.ModuleType("mooncake.engine") +fake_engine.TransferEngine = MagicMock() # type: ignore[attr-defined] +sys.modules["mooncake.engine"] = fake_engine + +from vllm_ascend.distributed.mooncake_connector import ( # noqa: E402 + KVCacheRecvingThread, KVCacheSendingThread, KVCacheTaskTracker, + KVConnectorRole, MooncakeAgentMetadata, MooncakeConnector, + MooncakeConnectorMetadata, MooncakeConnectorScheduler, + MooncakeConnectorWorker, ReqMeta, ensure_zmq_recv, ensure_zmq_send, + group_concurrent_contiguous, string_to_int64_hash, zmq_ctx) + +GET_META_MSG = b"get_meta_msg" +DONE_RECVING_MSG = b"done_recving_msg" + + +class TestKVCacheTaskTrackerInit(unittest.TestCase): + + def test_init_basic_properties(self): + tracker = KVCacheTaskTracker() + self.assertIsInstance(tracker.done_task_lock, type(threading.Lock())) + self.assertIsInstance(tracker.finished_requests, set) + self.assertIsInstance(tracker.delayed_free_requests, deque) + + +class TestGetAndClearFinishedSingleRequests(unittest.TestCase): + + def setUp(self): + self.tracker = KVCacheTaskTracker() + self.tracker.finished_requests = set() + self.tracker.done_task_lock = threading.Lock() + + def test_empty_requests(self): + result = self.tracker.get_and_clear_finished_requests() + self.assertEqual(result, set()) + self.assertEqual(len(self.tracker.finished_requests), 0) + + def test_single_request(self): + self.tracker.finished_requests = {"req_123"} + result = self.tracker.get_and_clear_finished_requests() + self.assertEqual(result, {"req_123"}) + self.assertEqual(len(self.tracker.finished_requests), 0) + + def test_multiple_requests(self): + self.tracker.finished_requests = {"req_1", "req_2", "req_3"} + result = self.tracker.get_and_clear_finished_requests() + self.assertSetEqual(result, 
{"req_1", "req_2", "req_3"}) + self.assertEqual(len(self.tracker.finished_requests), 0) + + @patch("vllm_ascend.distributed.mooncake_connector.logger") + def test_concurrent_access(self, mock_logger): + from concurrent.futures import ThreadPoolExecutor + self.tracker.finished_requests = {"req_1", "req_2"} + with ThreadPoolExecutor(max_workers=3) as executor: + futures = [ + executor.submit(self.tracker.get_and_clear_finished_requests) + for _ in range(3) + ] + results = [f.result() for f in futures] + self.assertEqual(sum(1 for r in results if r), 1) + self.assertEqual(len(self.tracker.finished_requests), 0) + + +class TestKVCacheSendingThreadInit(unittest.TestCase): + + def setUp(self): + self.common_args = { + 'tp_rank': 1, + 'decode_tp_size': 4, + 'local_engine_id': 'engine_1', + 'side_channel_host': 'localhost', + 'side_channel_port': 5555, + 'metadata': MagicMock(), + 'ready_event': threading.Event() + } + self.threads = [] + + def tearDown(self): + for thread in self.threads: + if hasattr(thread, 'task_tracker') and hasattr( + thread.task_tracker, 'socket'): + thread.task_tracker.socket.close() + if hasattr(thread, 'is_alive') and thread.is_alive(): + thread.join(timeout=0.1) + + def test_thread_daemon_property(self): + thread = KVCacheSendingThread(**self.common_args) + self.threads.append(thread) + self.assertTrue(thread.daemon) + + def test_thread_name_format(self): + thread = KVCacheSendingThread(**self.common_args) + self.threads.append(thread) + self.assertEqual(thread.name, "KVCacheSendingThread") + + def test_ready_event_reference(self): + custom_event = threading.Event() + args = self.common_args.copy() + args['ready_event'] = custom_event + thread = KVCacheSendingThread(**args) + self.threads.append(thread) + self.assertIs(thread.ready_event, custom_event) + + +class TestGetAndClearFinishedRequests(unittest.TestCase): + + def setUp(self): + self.common_args = { + 'tp_rank': 1, + 'decode_tp_size': 4, + 'local_engine_id': 'engine_1', + 
'side_channel_host': 'localhost', + 'side_channel_port': 5555, + 'metadata': { + "test": "metadata" + }, + 'ready_event': threading.Event() + } + self.thread = KVCacheSendingThread(**self.common_args) + + @patch.object(KVCacheTaskTracker, 'get_and_clear_finished_requests') + def test_get_and_clear_finished_requests(self, mock_get_clear): + expected_requests = {'req1', 'req2'} + mock_get_clear.return_value = expected_requests + result = self.thread.get_and_clear_finished_requests() + mock_get_clear.assert_called_once() + self.assertEqual(result, expected_requests) + + +class TestKVCacheSendingThread(unittest.TestCase): + + def test_run_handles_get_meta_and_done_recv_msgs(self): + ready_event = threading.Event() + metadata = MooncakeAgentMetadata( + engine_id="engine1", + te_rpc_port=9090, + kv_caches_base_addr=[12345678], + num_blocks=2, + ) + host = "127.0.0.1" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + free_port = s.getsockname()[1] + + thread = KVCacheSendingThread( + tp_rank=0, + decode_tp_size=1, + local_engine_id="engine1", + side_channel_host=host, + side_channel_port=free_port, + metadata=metadata, + ready_event=ready_event, + ) + thread.start() + self.assertTrue(ready_event.wait(timeout=3), + "Server thread startup timeout") + + context = zmq.Context() # type: ignore + sock = context.socket(zmq.DEALER) # type: ignore + sock.connect(f"tcp://{host}:{free_port}") + encoder = msgspec.msgpack.Encoder() + decoder = msgspec.msgpack.Decoder(type=MooncakeAgentMetadata) + + sock.send_multipart([b"", encoder.encode((GET_META_MSG, ))]) + frames = sock.recv_multipart() + self.assertEqual(frames[0], b"") + meta = decoder.decode(frames[1]) + self.assertEqual(meta.engine_id, "engine1") + self.assertEqual(meta.kv_caches_base_addr, [12345678]) + self.assertEqual(meta.num_blocks, 2) + + req_id = "request_42" + sock.send_multipart( + [b"", encoder.encode((DONE_RECVING_MSG, req_id, 0))]) + frames = sock.recv_multipart() + 
self.assertEqual(frames[0], b"") + self.assertEqual(frames[1], b"ACK") + self.assertIn(req_id, thread.task_tracker.finished_requests) + + sock.close() + context.term() + + +class TestKVCacheRecvingThreadBasic(unittest.TestCase): + + def setUp(self): + self.engine = MagicMock() + self.ready_event = threading.Event() + self.thread = KVCacheRecvingThread( + tp_rank=0, + tp_size=4, + engine=self.engine, + local_engine_id="local_engine", + local_handshake_port=5555, + local_kv_caches_base_addr=[0x1000, 0x2000], + block_len=[1024, 2048], + ready_event=self.ready_event) + + def test_add_request(self): + test_req = { + "request_id": "req1", + "local_block_ids": [1, 2], + "remote_block_ids": [3, 4], + "remote_engine_id": "remote_engine", + "remote_host": "localhost", + "remote_handshake_port": 6666, + } + self.thread.add_request(**test_req) + queued = self.thread.request_queue.get_nowait() + self.assertEqual(queued["request_id"], "req1") + self.assertEqual(queued["remote_host"], "localhost") + + @patch.object(KVCacheTaskTracker, 'get_and_clear_finished_requests') + def test_get_finished_requests(self, mock_tracker): + mock_tracker.return_value = {"req1", "req2"} + result = self.thread.get_and_clear_finished_requests() + self.assertEqual(result, {"req1", "req2"}) + + +class TestSocketManagement(unittest.TestCase): + + def setUp(self): + self.engine = MagicMock() + self.ready_event = threading.Event() + self.thread = KVCacheRecvingThread( + tp_rank=0, + tp_size=4, + engine=self.engine, + local_engine_id="local_engine", + local_handshake_port=5555, + local_kv_caches_base_addr=[0x1000, 0x2000], + block_len=[1024, 2048], + ready_event=self.ready_event) + self.thread.remote_sockets = defaultdict(deque) + self.thread.remote_poller = MagicMock() + + @patch('vllm_ascend.distributed.mooncake_connector.zmq.Context') + @patch('vllm_ascend.distributed.mooncake_connector.make_zmq_socket') + def test_get_remote_socket(self, mock_make_socket, mock_context): + mock_sock = MagicMock() + 
mock_make_socket.return_value = mock_sock + test_host = "test_host" + test_port = 12345 + + sock = self.thread._get_remote_socket(test_host, test_port) + + self.assertEqual(sock, mock_sock) + mock_make_socket.assert_called_once() + args, kwargs = mock_make_socket.call_args + self.assertEqual(kwargs.get('path'), 'tcp://test_host:12345') + self.assertEqual(kwargs.get('socket_type'), zmq.REQ) # type: ignore + self.assertFalse(kwargs.get('bind', True)) + self.thread.remote_poller.register.assert_called_with( + mock_sock, zmq.POLLIN) # type: ignore + + def test_return_socket_to_pool(self): + mock_sock = MagicMock() + test_host = "test_host" + test_port = 12345 + test_path = make_zmq_path("tcp", test_host, test_port) + + self.thread._return_remote_socket(mock_sock, test_host, test_port) + + self.assertEqual(len(self.thread.remote_sockets[test_path]), 1) + self.assertEqual(self.thread.remote_sockets[test_path][0], mock_sock) + self.thread.remote_poller.register.assert_not_called() + + +class TestCoreFunctionality(unittest.TestCase): + + def setUp(self): + self.engine = MagicMock() + self.ready_event = threading.Event() + self.mock_queue = MagicMock() + self.thread = KVCacheRecvingThread( + tp_rank=0, + tp_size=4, + engine=self.engine, + local_engine_id="local_engine", + local_handshake_port=5555, + local_kv_caches_base_addr=[0x1000, 0x2000], + block_len=[1024, 2048], + ready_event=self.ready_event) + self.thread.request_queue = self.mock_queue + self.test_req = { + "request_id": "req1", + "local_block_ids": [1, 2], + "remote_block_ids": [3, 4], + "remote_engine_id": "remote_engine", + "remote_host": "localhost", + "remote_handshake_port": 6666, + "remote_transfer_port": 7777 + } + self.thread.task_tracker = MagicMock() + self.engine.batch_transfer_sync_read.return_value = 0 + self.thread.remote_te_port = {"remote_engine": {6666: 7777}} + + @patch.object(KVCacheRecvingThread, '_transfer_kv_cache') + @patch.object(KVCacheRecvingThread, '_send_done_recv_signal') + def 
test_handle_request(self, mock_send, mock_transfer): + self.thread._handle_request(self.test_req) + mock_transfer.assert_called_once_with(self.test_req) + mock_send.assert_called_once_with("req1", "localhost", 6666) + self.thread.task_tracker.update_done_task_count.assert_called_once_with( + "req1") + self.mock_queue.task_done.assert_called_once() + + @patch.object(KVCacheRecvingThread, '_get_remote_metadata') + def test_transfer_kv_cache(self, mock_get_meta): + self.thread.kv_caches_base_addr["remote_engine"] = { + 6666: [0x3000, 0x4000] + } + + self.thread._transfer_kv_cache(self.test_req) + + self.engine.batch_transfer_sync_read.assert_called_once() + call_args, call_kwargs = self.engine.batch_transfer_sync_read.call_args + self.assertEqual(call_args[0], "localhost:7777") + self.assertIsInstance(call_args[1], list) + self.assertIsInstance(call_args[2], list) + self.assertIsInstance(call_args[3], list) + self.assertEqual(len(call_args[1]), len(call_args[2])) + self.assertEqual(len(call_args[1]), len(call_args[3])) + mock_get_meta.assert_not_called() + + def test_transfer_kv_cache_failure(self): + self.engine.batch_transfer_sync_read.return_value = -1 + self.thread.kv_caches_base_addr["remote_engine"] = { + 6666: [0x3000, 0x4000] + } + + with self.assertRaises(RuntimeError): + self.thread._transfer_kv_cache(self.test_req) + + +class TestMetadataHandling(unittest.TestCase): + + def setUp(self): + self.engine = MagicMock() + self.ready_event = threading.Event() + self.thread = KVCacheRecvingThread( + tp_rank=0, + tp_size=4, + engine=self.engine, + local_engine_id="local_engine", + local_handshake_port=5555, + local_kv_caches_base_addr=[0x1000, 0x2000], + block_len=[1024, 2048], + ready_event=self.ready_event) + self.test_metadata = MooncakeAgentMetadata( + engine_id="remote_engine", + te_rpc_port=9090, + kv_caches_base_addr=[0x3000, 0x4000], + num_blocks=2) + + @patch('vllm_ascend.distributed.mooncake_connector.ensure_zmq_send') + 
@patch('vllm_ascend.distributed.mooncake_connector.ensure_zmq_recv') + def test_get_remote_metadata_success(self, mock_recv, mock_send): + mock_recv.return_value = msgspec.msgpack.encode(self.test_metadata) + + with patch.object(self.thread, '_get_remote_socket') as mock_get_socket, \ + patch.object(self.thread, '_return_remote_socket') as mock_return_socket: + mock_socket = MagicMock() + mock_get_socket.return_value = mock_socket + + self.thread._get_remote_metadata("host1", 5555) + + mock_get_socket.assert_called_once_with("host1", 5555) + mock_return_socket.assert_called_once_with(mock_socket, "host1", + 5555) + mock_send.assert_called_once_with( + mock_socket, self.thread.encoder.encode((GET_META_MSG, ""))) + mock_recv.assert_called_once_with(mock_socket, + self.thread.remote_poller) + self.assertEqual( + self.thread.kv_caches_base_addr["remote_engine"][5555], + [0x3000, 0x4000]) + + @patch('vllm_ascend.distributed.mooncake_connector.ensure_zmq_send') + @patch('vllm_ascend.distributed.mooncake_connector.ensure_zmq_recv', + side_effect=Exception("Network error")) + def test_get_remote_metadata_failure(self, mock_recv, mock_send): + with patch.object(self.thread, '_get_remote_socket') as mock_get_socket, \ + patch.object(self.thread, '_return_remote_socket') as mock_return_socket: + mock_socket = MagicMock() + mock_get_socket.return_value = mock_socket + + with self.assertRaises(Exception) as context: + self.thread._get_remote_metadata("host1", 5555) + + self.assertEqual(str(context.exception), "Network error") + mock_return_socket.assert_called_once() + + +class TestMainThreadLoop(unittest.TestCase): + + def setUp(self): + self.engine = MagicMock() + self.ready_event = threading.Event() + self.thread = KVCacheRecvingThread( + tp_rank=0, + tp_size=4, + engine=self.engine, + local_engine_id="local_engine", + local_handshake_port=5555, + local_kv_caches_base_addr=[0x1000, 0x2000], + block_len=[1024, 2048], + ready_event=self.ready_event) + self.thread.request_queue 
= queue.Queue() + + @patch.object(KVCacheRecvingThread, '_handle_request') + def test_run_loop_normal(self, mock_handle): + test_request = { + "request_id": "req1", + "local_block_ids": [1, 2], + "remote_block_ids": [3, 4], + "remote_engine_id": "remote_engine", + "remote_host": "localhost", + "remote_handshake_port": 6666, + "remote_transfer_port": 7777 + } + + self.thread.request_queue.put(test_request) + self.thread.request_queue.put(None) + + self.thread.start() + time.sleep(0.1) + self.thread.join(timeout=1.0) + + self.assertTrue(self.thread.ready_event.is_set()) + mock_handle.assert_called_once_with(test_request) + self.assertTrue(self.thread.request_queue.empty()) + + +class MockVllmConfig: + + def __init__(self): + self.model_config = MagicMock() + self.parallel_config = MagicMock() + self.cache_config = MagicMock() + self.kv_transfer_config = MagicMock() + self.model_config.use_mla = True + self.parallel_config.tensor_parallel_size = 2 + self.parallel_config.data_parallel_rank_local = 0 + self.parallel_config.data_parallel_size_local = 1 + self.cache_config.block_size = 16 + self.kv_transfer_config.kv_port = 5000 + self.kv_transfer_config.kv_role = 'kv_producer' + self.kv_transfer_config.get_from_extra_config = MagicMock() + self.kv_transfer_config.get_from_extra_config.side_effect = lambda k, d: { + "prefill": { + "tp_size": 2, + "dp_size": 1 + }, + "decode": { + "tp_size": 2, + "dp_size": 1 + } + }.get(k, d) + + +class MockRequest: + + def __init__(self, + request_id, + prompt_token_ids=None, + kv_transfer_params=None, + status=None): + self.request_id = request_id + self.prompt_token_ids = prompt_token_ids or [1, 2, 3, 4] + self.kv_transfer_params = kv_transfer_params or {} + self.status = status or "running" + self.output_token_ids = [101, 102] + + +class TestKVCacheTaskTracker(unittest.TestCase): + + def setUp(self): + self.tracker = KVCacheTaskTracker() + + def test_update_done_task_count(self): + self.assertEqual(len(self.tracker.finished_requests), 
0) + self.assertEqual(len(self.tracker.delayed_free_requests), 0) + + current_time = time.time() + self.tracker.add_delayed_request("req_1", current_time) + result = self.tracker.delayed_free_requests + self.assertEqual(len(result), 1) + self.assertEqual(result[0], ("req_1", current_time)) + + self.tracker.update_done_task_count("req_1") + result_finished = self.tracker.finished_requests + result_delayed = self.tracker.delayed_free_requests + self.assertEqual(result_finished, {"req_1"}) + self.assertEqual(len(result_delayed), 0) + + def test_retrieve_expired_requests(self): + current_time = time.time() + self.tracker.add_delayed_request("req_1", current_time - 600) + self.tracker.add_delayed_request("req_2", current_time) + result = self.tracker._retrieve_expired_requests() + self.assertEqual(result, { + "req_1", + }) + result_delay = self.tracker.delayed_free_requests + self.assertEqual(len(result_delay), 1) + self.assertEqual(result_delay[0], ("req_2", current_time)) + + def test_duplicate_task_update(self): + self.tracker.update_done_task_count("req1") + self.tracker.update_done_task_count("req1") + self.tracker.update_done_task_count("req1") + + finished = self.tracker.get_and_clear_finished_requests() + self.assertEqual(finished, {"req1"}) + + +class TestMooncakeConnectorMetadata(unittest.TestCase): + + def test_add_new_req(self): + meta = MooncakeConnectorMetadata() + self.assertEqual(len(meta.requests), 0) + self.assertEqual(len(meta.requests_to_send), 0) + + meta.add_new_req(request_id="req1", + local_block_ids=[1, 2, 3], + kv_transfer_params={ + "remote_block_ids": [4, 5, 6], + "remote_engine_id": "remote_engine", + "remote_host": "localhost", + "remote_port": 5000 + }) + + self.assertEqual(len(meta.requests), 1) + req_meta = meta.requests["req1"] + self.assertIsInstance(req_meta, ReqMeta) + self.assertEqual(req_meta.local_block_ids, [1, 2, 3]) + self.assertEqual(req_meta.remote_block_ids, [4, 5, 6]) + self.assertEqual(req_meta.remote_engine_id, 
"remote_engine") + self.assertEqual(req_meta.remote_host, "localhost") + self.assertEqual(req_meta.remote_port, 5000) + + +class TestMooncakeConnectorSchedulerMatchedTokens(unittest.TestCase): + + def setUp(self): + config = MockVllmConfig() + self.scheduler = MooncakeConnectorScheduler(config, "test_engine") + + def test_get_num_new_matched_tokens(self): + request = MockRequest("req1") + tokens, async_flag = self.scheduler.get_num_new_matched_tokens( + request, 0) + self.assertEqual(tokens, 0) + self.assertFalse(async_flag) + + request.kv_transfer_params = {"do_remote_prefill": True} + tokens, async_flag = self.scheduler.get_num_new_matched_tokens( + request, 0) + self.assertEqual(tokens, 3) + self.assertTrue(async_flag) + + def test_build_connector_meta(self): + request = MockRequest("req1") + blocks_mock = MagicMock() + blocks_mock.get_unhashed_block_ids.return_value = [4, 5, 6] + self.scheduler._reqs_need_recv["req1"] = (request, [4, 5, 6]) + request.kv_transfer_params = { + "remote_block_ids": [1, 2, 3], + "remote_engine_id": "remote", + "remote_host": "localhost", + "remote_port": 5000 + } + + meta = self.scheduler.build_connector_meta(MagicMock()) + self.assertIsInstance(meta, MooncakeConnectorMetadata) + self.assertEqual(len(meta.requests), 1) + self.assertEqual(meta.requests["req1"].local_block_ids, [4, 5, 6]) + self.assertEqual(meta.requests["req1"].remote_block_ids, [1, 2, 3]) + self.assertEqual(len(self.scheduler._reqs_need_recv), 0) + + def test_get_finished_count(self): + count = self.scheduler.get_finished_count() + self.assertEqual(count, 2) + + +class TestHelperFunctions(unittest.TestCase): + + def test_group_concurrent_contiguous(self): + src: list[int] = [1, 2, 3, 5, 6] + dst: list[int] = [10, 11, 12, 14, 15] + + src_groups, dst_groups = group_concurrent_contiguous(src, dst) + + self.assertEqual(len(src_groups), 2) + self.assertEqual(src_groups[0], [1, 2, 3]) + self.assertEqual(src_groups[1], [5, 6]) + self.assertEqual(dst_groups[0], [10, 11, 
12]) + self.assertEqual(dst_groups[1], [14, 15]) + + def test_group_concurrent_contiguous_empty(self): + src: list[int] = [] + dst: list[int] = [] + src_groups, dst_groups = group_concurrent_contiguous(src, dst) + self.assertEqual(src_groups, []) + self.assertEqual(dst_groups, []) + + def test_string_to_int64_hash(self): + hash1 = string_to_int64_hash("test_string") + hash2 = string_to_int64_hash("test_string") + self.assertEqual(hash1, hash2) + + hash3 = string_to_int64_hash("different_string") + self.assertNotEqual(hash1, hash3) + + +class TestMooncakeConnectorForScheduler(unittest.TestCase): + + def test_scheduler_role(self): + config = MockVllmConfig() + connector = MooncakeConnector(config, KVConnectorRole.SCHEDULER) + self.assertIsNotNone(connector.connector_scheduler) + self.assertIsNone(connector.connector_worker) + + @patch.object(MooncakeConnectorScheduler, "get_num_new_matched_tokens") + def test_scheduler_methods(self, mock_method): + config = MockVllmConfig() + connector = MooncakeConnector(config, KVConnectorRole.SCHEDULER) + request = MockRequest("req1") + connector.get_num_new_matched_tokens(request, 0) + mock_method.assert_called_once_with(request, 0) + + +class MockKVCacheBlocks: + + def get_unhashed_block_ids(self): + return [4, 5, 6] + + +class MockSchedulerOutput: + pass + + +class MockForwardContext: + pass + + +class TestMooncakeConnector(unittest.TestCase): + + def setUp(self): + self.config = MockVllmConfig() + os.environ["ASCEND_RT_VISIBLE_DEVICES"] = "0,1" + + def test_scheduler_initialization(self): + connector = MooncakeConnector(self.config, KVConnectorRole.SCHEDULER) + self.assertIsNotNone(connector.connector_scheduler) + self.assertIsNone(connector.connector_worker) + + @patch.object(MooncakeConnectorScheduler, "get_num_new_matched_tokens") + def test_get_num_new_matched_tokens(self, mock_method): + connector = MooncakeConnector(self.config, KVConnectorRole.SCHEDULER) + request = MockRequest("req1") + 
connector.get_num_new_matched_tokens(request, 0) + mock_method.assert_called_once_with(request, 0) + + @patch.object(MooncakeConnectorScheduler, "update_state_after_alloc") + def test_update_state_after_alloc(self, mock_method): + connector = MooncakeConnector(self.config, KVConnectorRole.SCHEDULER) + request = MockRequest("req1") + blocks = MockKVCacheBlocks() + connector.update_state_after_alloc(request, blocks, 3) + mock_method.assert_called_once_with(request, blocks, 3) + + @patch.object(MooncakeConnectorScheduler, "build_connector_meta") + def test_build_connector_meta(self, mock_method): + connector = MooncakeConnector(self.config, KVConnectorRole.SCHEDULER) + scheduler_output = MockSchedulerOutput() + connector.build_connector_meta(scheduler_output) + mock_method.assert_called_once_with(scheduler_output) + + @patch.object(MooncakeConnectorScheduler, "request_finished") + def test_request_finished(self, mock_method): + connector = MooncakeConnector(self.config, KVConnectorRole.SCHEDULER) + request = MockRequest("req1") + connector.request_finished(request, [1, 2, 3]) + mock_method.assert_called_once_with(request, [1, 2, 3]) + + +class TestMooncakeConnectorScheduler(unittest.TestCase): + + def setUp(self): + self.config = MockVllmConfig() + self.scheduler = MooncakeConnectorScheduler(self.config, "test_engine") + + def test_get_num_new_matched_tokens_no_remote_prefill(self): + request = MockRequest("req1") + tokens, async_flag = self.scheduler.get_num_new_matched_tokens( + request, 0) + self.assertEqual(tokens, 0) + self.assertFalse(async_flag) + + def test_get_num_new_matched_tokens_with_remote_prefill(self): + request = MockRequest("req1", + kv_transfer_params={"do_remote_prefill": True}) + tokens, async_flag = self.scheduler.get_num_new_matched_tokens( + request, 0) + self.assertEqual(tokens, 3) + self.assertTrue(async_flag) + + def test_update_state_after_alloc_no_remote_prefill(self): + request = MockRequest("req1") + blocks = MagicMock() + 
self.scheduler.update_state_after_alloc(request, blocks, 0) + self.assertEqual(len(self.scheduler._reqs_need_recv), 0) + + def test_update_state_after_alloc_with_remote_prefill(self): + request = MockRequest("req1", + kv_transfer_params={ + "do_remote_prefill": True, + "remote_block_ids": [1, 2, 3], + "remote_engine_id": "remote", + "remote_host": "localhost", + "remote_port": 5000 + }) + blocks = MockKVCacheBlocks() + self.scheduler.update_state_after_alloc(request, blocks, 3) + self.assertEqual(len(self.scheduler._reqs_need_recv), 1) + self.assertEqual(self.scheduler._reqs_need_recv["req1"][0], request) + self.assertEqual(self.scheduler._reqs_need_recv["req1"][1], [4, 5, 6]) + + def test_request_finished_no_remote_decode(self): + request = MockRequest("req1") + delay_free, params = self.scheduler.request_finished( + request, [1, 2, 3]) + self.assertFalse(delay_free) + self.assertIsNone(params) + + +class TestUtils(unittest.TestCase): + + def test_string_to_int64_hash(self): + h1 = string_to_int64_hash("hello") + h2 = string_to_int64_hash("hello") + h3 = string_to_int64_hash("world") + self.assertEqual(h1, h2) + self.assertNotEqual(h1, h3) + self.assertIsInstance(h1, int) + + def test_group_concurrent_contiguous(self): + src: list[int] = [1, 2, 3, 5, 6] + dst: list[int] = [10, 11, 12, 20, 21] + src_g, dst_g = group_concurrent_contiguous(src, dst) + self.assertEqual(src_g, [[1, 2, 3], [5, 6]]) + self.assertEqual(dst_g, [[10, 11, 12], [20, 21]]) + + def test_group_empty(self): + src_g, dst_g = group_concurrent_contiguous([], []) + self.assertEqual(src_g, []) + self.assertEqual(dst_g, []) + + def test_zmq_ctx_invalid_type(self): + with self.assertRaises(ValueError): + with zmq_ctx("INVALID", "tcp://127.0.0.1:5555"): + pass + + @patch("vllm_ascend.distributed.mooncake_connector.make_zmq_socket") + def test_zmq_ctx_ok(self, mock_make_socket): + mock_socket = MagicMock() + mock_make_socket.return_value = mock_socket + with zmq_ctx(zmq.REQ, "tcp://localhost:1234") as s: 
# type: ignore + self.assertEqual(s, mock_socket) + + @patch("vllm_ascend.distributed.mooncake_connector.logger") + def test_ensure_zmq_send_success(self, mock_logger): + mock_socket = MagicMock() + ensure_zmq_send(mock_socket, b"hello") + mock_socket.send.assert_called_once_with(b"hello") + + @patch("vllm_ascend.distributed.mooncake_connector.logger") + def test_ensure_zmq_send_retry_and_fail(self, mock_logger): + mock_socket = MagicMock() + mock_socket.send.side_effect = zmq.ZMQError( # type: ignore + "send failed") + with self.assertRaises(RuntimeError): + ensure_zmq_send(mock_socket, b"hello", max_retries=2) + self.assertEqual(mock_socket.send.call_count, 2) + + @patch("vllm_ascend.distributed.mooncake_connector.logger") + def test_ensure_zmq_recv_success(self, mock_logger): + mock_socket = MagicMock() + mock_socket.recv.return_value = b"response" + mock_poller = MagicMock() + mock_poller.poll.return_value = [ + (mock_socket, zmq.POLLIN) # type: ignore + ] + data = ensure_zmq_recv(mock_socket, mock_poller) + self.assertEqual(data, b"response") + + @patch("vllm_ascend.distributed.mooncake_connector.logger") + def test_ensure_zmq_recv_timeout_and_fail(self, mock_logger): + mock_socket = MagicMock() + mock_poller = MagicMock() + mock_poller.poll.return_value = [] + with self.assertRaises(RuntimeError): + ensure_zmq_recv(mock_socket, + mock_poller, + timeout=0.01, + max_retries=2) + + +class MockMooncakeAgentMetadata: + + def __init__(self, **kwargs): + pass + + +class MockMooncakeConnectorMetadata: + + def __init__(self): + self.requests = {} + + +class MockKVCacheSendingThread(threading.Thread): + + def __init__(self, *args, **kwargs): + super().__init__() + self.daemon = True + self._finished_requests = set() + + def get_and_clear_finished_requests(self): + return self._finished_requests + + def start(self): + pass + + +class MockKVCacheRecvingThread(threading.Thread): + + def __init__(self, *args, **kwargs): + super().__init__() + self.daemon = True + 
self._finished_requests = set() + self.add_request = MagicMock() + + def get_and_clear_finished_requests(self): + return self._finished_requests + + def start(self): + pass + + +class MockTensor: + + def __init__(self, *args, **kwargs): + self.size = MagicMock(return_value=(10, 16, 8, 16)) + self.element_size = MagicMock(return_value=4) + self.shape = (10, 16, 8, 16) + self.data_ptr = MagicMock(return_value=0x1000) + + +mock_envs_ascend = MagicMock() +mock_envs_ascend.MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol" + +mock_logger = MagicMock() + + +class MockTransferEngine: + + def initialize(self, *args, **kwargs): + return 0 + + def register_memory(self, *args, **kwargs): + return 1 + + +class MockEnvsAscend: + MOONCAKE_CONNECTOR_PROTOCOL = "mock_protocol" + PHYSICAL_DEVICES = "10,11" + + +def mock_get_tensor_model_parallel_rank(): + return 0 + + +def mock_get_tp_group(): + return MagicMock() + + +def mock_get_ip(): + return "127.0.0.1" + + +def mock_string_to_int64_hash(s): + return hash(s) + + +class TestMooncakeConnectorWorker(unittest.TestCase): + + def setUp(self): + self.envs_ascend_mock = MockEnvsAscend() + self.mock_transfer_engine = MagicMock() + self.mock_transfer_engine.get_rpc_port.return_value = 9090 + self.mock_transfer_engine.initialize.return_value = 0 + self.mock_transfer_engine.register_memory.return_value = 0 + + self.patches = [ + patch('os.getenv', return_value="10,11"), + patch('torch.Tensor.size', return_value=(10, 16, 8, 16)), + patch('torch.Tensor.element_size', return_value=4), + patch('torch.Tensor.data_ptr', return_value=0x1000), + patch('math.prod', return_value=128), + patch('random.Random'), + patch( + 'vllm_ascend.distributed.mooncake_connector.get_tensor_model_parallel_rank', + mock_get_tensor_model_parallel_rank), + patch('vllm_ascend.distributed.mooncake_connector.get_tp_group', + mock_get_tp_group), + patch('vllm_ascend.distributed.mooncake_connector.get_ip', + mock_get_ip), + patch( + 
'vllm_ascend.distributed.mooncake_connector.string_to_int64_hash', + mock_string_to_int64_hash), + patch('vllm_ascend.distributed.mooncake_connector.TransferEngine', + return_value=self.mock_transfer_engine), + patch( + 'vllm_ascend.distributed.mooncake_connector.KVCacheSendingThread', + MagicMock()), + patch( + 'vllm_ascend.distributed.mooncake_connector.KVCacheRecvingThread', + MagicMock()), + patch('vllm_ascend.distributed.mooncake_connector.logger', + MagicMock()), + patch('vllm_ascend.distributed.mooncake_connector.threading.Event', + MagicMock()), + patch.dict('sys.modules', + {'vllm_ascend.envs': self.envs_ascend_mock}), + ] + + for p in self.patches: + p.start() # type: ignore + + self.vllm_config = MockVllmConfig() + self.engine_id = "test_engine" + self.kv_caches = {"layer1": (MagicMock(), MagicMock())} + + def tearDown(self): + for p in self.patches: + p.stop() # type: ignore + + def test_register_kv_caches_producer(self): + worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id) + worker.register_kv_caches(self.kv_caches) + self.assertEqual(len(worker.kv_caches), 1) + self.assertIsNotNone(worker.kv_send_thread) + self.assertIsNone(worker.kv_recv_thread) + + def test_register_kv_caches_consumer(self): + self.vllm_config.kv_transfer_config.kv_role = 'kv_consumer' + worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id) + worker.register_kv_caches(self.kv_caches) + self.assertIsNone(worker.kv_send_thread) + self.assertIsNotNone(worker.kv_recv_thread) + + def test_register_kv_caches_mla_case(self): + mla_cache1 = MagicMock() + mla_cache1.size.return_value = (10, 16, 1, 16) + mla_cache2 = MagicMock() + mla_cache2.size.return_value = (10, 16, 1, 8) + mla_caches = {"layer1": (mla_cache1, mla_cache2)} + + worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id) + worker.register_kv_caches(mla_caches) + self.assertTrue(worker.use_mla) + self.assertEqual(len(worker.block_len), 2) + + def 
test_device_id_selection_with_physical_devices(self): + # Test with physical devices set + worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id) + # Default tp_rank is 0, so device_id should be 10 + self.assertEqual(worker.device_id, 10) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/ut/kv_connector/test_remote_decode_lifecycle.py b/tests/ut/kv_connector/test_remote_decode_lifecycle.py new file mode 100644 index 0000000..bf44c0f --- /dev/null +++ b/tests/ut/kv_connector/test_remote_decode_lifecycle.py @@ -0,0 +1,169 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# +import copy + +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.request import FinishReason, RequestStatus + +from tests.ut.kv_connector.utils import (assert_scheduler_empty, + create_model_runner_output, + create_request, create_scheduler, + create_vllm_config) + + +def test_basic_lifecycle(): + """Test lifecycle of a Remote Decode request.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. 
+ BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request = create_request(request_id=1, + max_tokens=1, + num_tokens=NUM_TOKENS, + do_remote_decode=True) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): Prefill. + # (1a): schedule() + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + + # (1b): execute_model() + model_runner_output = create_model_runner_output(reqs=[request]) + + # (1c): update_from_output() + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + + # Ensure the request is finished after 1 tokens. + assert request.is_finished() + assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED + output = engine_core_outputs[0].outputs[0] + assert output.finish_reason == FinishReason.LENGTH + assert output.kv_transfer_params is not None + + # Request freed in Scheduler and blocks should be freed + assert request_id in scheduler.finished_req_ids + assert len(scheduler.running) == 0 + assert len(scheduler.waiting) == 0 + + # ... but blocks should not be freed. + blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_id] + for block in blocks: + assert block.ref_cnt == 1 + + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 0 + assert len(scheduler_output.finished_req_ids) == 1 + assert request_id in scheduler_output.finished_req_ids + assert len(scheduler_output.scheduled_new_reqs) == 0 + assert scheduler_output.scheduled_cached_reqs.num_reqs == 0 + assert len(scheduler.finished_req_ids) == 0 + + # (2b): execute_model() + model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT + + # (2c): update_from_output() + scheduler.update_from_output(scheduler_output, model_runner_output) + + # STEP (3): Finished sending. 
+ # (3a): schedule() - pass finished request to PB. + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 0 + assert len(scheduler_output.finished_req_ids) == 0 + assert len(scheduler_output.scheduled_new_reqs) == 0 + assert scheduler_output.scheduled_cached_reqs.num_reqs == 0 + assert len(scheduler.finished_req_ids) == 0 + + # (3b): execute_model() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + from vllm.v1.worker.kv_connector_model_runner_mixin import \ + KVConnectorOutput # type: ignore # noqa + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=[request_id]) + + # (3c): update_from_output() + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Confirm we do not have any memory leaks after req lifecycle. + assert_scheduler_empty(scheduler) + + +def test_prefix_cache_lifecycle(): + """Test that remote decode params still works with a prefix cache hit.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # Prime the KVCache. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 3 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request_remote_a = create_request(request_id=1, num_tokens=NUM_TOKENS) + + scheduler.add_request(request_remote_a) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote_a], + use_eos=True) + scheduler.update_from_output(scheduler_output, model_runner_output) + scheduler.schedule() + scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT) + + ##################### + # Actual Test: confirm we send all blocks. + + # Step (1): Send the KV Transfer. 
+ NUM_EXTERNAL_FULL_BLOCKS -= 1 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + request_remote = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_decode=True) + + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output(reqs=[request_remote]) + eco = scheduler.update_from_output(scheduler_output, model_runner_output) + kv_transfer_params = eco[0].outputs[0].kv_transfer_params + # Ensure we send all block ids, even if there is a cache hit. + assert (len( + kv_transfer_params["remote_block_ids"]) == (NUM_EXTERNAL_FULL_BLOCKS + + 1)) + + # STEP (2): Ensure it is freed. + scheduler_output = scheduler.schedule() + scheduler.schedule() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + from vllm.v1.worker.kv_connector_model_runner_mixin import \ + KVConnectorOutput # noqa + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_sending=[request_remote.request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + _ = scheduler.schedule() + assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/test_remote_prefill_lifecycle.py b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py new file mode 100644 index 0000000..c9b8891 --- /dev/null +++ b/tests/ut/kv_connector/test_remote_prefill_lifecycle.py @@ -0,0 +1,239 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# +import copy + +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT +from vllm.v1.request import RequestStatus + +from tests.ut.kv_connector.utils import (assert_scheduler_empty, + create_model_runner_output, + create_request, create_scheduler, + create_vllm_config) + + +def test_basic_lifecycle(): + """Test lifecycle of a remote prefill.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + START_FREE_BLOCK_QUEUE_SIZE = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + + request = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True, + block_size=BLOCK_SIZE) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): + # (1a): schedule() + scheduler_output = scheduler.schedule() + + # Nothing running and empty scheduler output. + assert len(scheduler.running) == 0 + assert len(scheduler_output.scheduled_new_reqs) == 0 + assert scheduler_output.scheduled_cached_reqs.num_reqs == 0 + assert len(scheduler_output.num_scheduled_tokens) == 0 + assert scheduler_output.total_num_scheduled_tokens == 0 + + # Req waiting for KVs with no computed/scheduled toks ... + assert len(scheduler.waiting) == 1 + assert request in scheduler.waiting + assert (request.status == RequestStatus.WAITING_FOR_REMOTE_KVS) + assert (request.num_computed_tokens == 0) + + # ... but should have (uncached) blocks allocated to it. 
+ block_pool = scheduler.kv_cache_manager.block_pool + assert (block_pool.free_block_queue.num_free_blocks + < START_FREE_BLOCK_QUEUE_SIZE) + assert len(block_pool.cached_block_hash_to_block) == 0 + blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_id] + for block in blocks: + assert block._block_hash is None + + # (1b): forward() + model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT + + # (1c): update_from_output() + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + assert not engine_core_outputs or not engine_core_outputs[0].outputs + + # STEP (2): + # (2a): schedule(): nothing happens! + scheduler_output = scheduler.schedule() + assert len(scheduler.waiting) == 1 + assert len(scheduler.running) == 0 + + # (2b): forward(): request finishes recv. + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + from vllm.v1.worker.kv_connector_model_runner_mixin import \ + KVConnectorOutput # type: ignore # noqa + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_recving=[request_id]) + + # (2c): update_from_output(): + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + assert len(scheduler.waiting) == 1 + assert (request_id in scheduler.finished_recving_kv_req_ids) + + # STEP (3): + # (3a): schedule(): this should actually schedule. + scheduler_output = scheduler.schedule() + assert len(scheduler.running) == 1 + + # Confirm the block are actually allocated. + num_hashed_blocks = 0 + blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_id] + for block in blocks: + assert block.ref_cnt == 1 + num_hashed_blocks += (1 if block._block_hash is not None else 0) + assert num_hashed_blocks == NUM_EXTERNAL_FULL_BLOCKS + + # Confirm the rest of the prompt is scheduled in this step. 
+ scheduled_req = scheduler_output.scheduled_new_reqs[0] + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[request_id] + num_computed_tokens = scheduled_req.num_computed_tokens + total_prompt_tokens = len(scheduled_req.prompt_token_ids) + assert (num_scheduled_tokens == total_prompt_tokens - num_computed_tokens) + + # (3b): execute_model() + model_runner_output = create_model_runner_output([request]) + # (3c): update_from_output() + scheduler.update_from_output(scheduler_output, model_runner_output) + + # Step (4): Hit EOS. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output([request], use_eos=True) + engine_core_outputs = scheduler.update_from_output(scheduler_output, + model_runner_output) + scheduler.schedule() + + assert_scheduler_empty(scheduler) + + +def test_no_spurious_prefix_caching(): + """ + With P/D, blocks can be allocated but uncomputed for + multiple engine steps. This test confirms that we do + not accidentally have cache hits against uncomputed + blocks. + """ + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 and a half full external blocks. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * (NUM_EXTERNAL_FULL_BLOCKS + 0.5)) + + # Both of these requests have prompts like [1,1,1,1,1, ...] + request_remote = create_request( + request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True, + use_all_1s_for_prompt_tokens=True, + ) + + # Schedule the remote prefill request. This should not + # cause any blocks to be cached. + scheduler.add_request(request_remote) + scheduler_output = scheduler.schedule() + scheduler.update_from_output(scheduler_output, EMPTY_MODEL_RUNNER_OUTPUT) + assert len(scheduler.waiting) == 1 + + remote_blocks = scheduler.kv_cache_manager.coordinator.single_type_managers[ + 0].req_to_blocks[request_remote.request_id] + + # Remote blocks should not be cached. 
+ for block in remote_blocks: + assert block.ref_cnt == 1 + assert block._block_hash is None + + +def test_full_block_prompt(): + """Test that we handle a prompt that is the full block size.""" + + vllm_config = create_vllm_config() + scheduler = create_scheduler(vllm_config) + + # 2 Full Blocks and 1 Half Block. + BLOCK_SIZE = vllm_config.cache_config.block_size + NUM_EXTERNAL_FULL_BLOCKS = 2 + NUM_TOKENS = int(BLOCK_SIZE * NUM_EXTERNAL_FULL_BLOCKS) + + request = create_request(request_id=1, + num_tokens=NUM_TOKENS, + do_remote_prefill=True) + + scheduler.add_request(request) + request_id = request.request_id + + # STEP (1): Initialize a recv. + scheduler_output = scheduler.schedule() + # All blocks should be allocated. + num_blocks = len(scheduler.kv_cache_manager.coordinator. + single_type_managers[0].req_to_blocks[request_id]) + assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS + model_runner_output = EMPTY_MODEL_RUNNER_OUTPUT + scheduler.update_from_output(scheduler_output, model_runner_output) + + # # STEP (2): Recv. + scheduler_output = scheduler.schedule() + model_runner_output = copy.deepcopy(EMPTY_MODEL_RUNNER_OUTPUT) + from vllm.v1.worker.kv_connector_model_runner_mixin import \ + KVConnectorOutput # type: ignore # noqa + model_runner_output.kv_connector_output = KVConnectorOutput( + finished_recving=[request_id]) + scheduler.update_from_output(scheduler_output, model_runner_output) + assert len(scheduler.waiting) == 1 + assert (request_id in scheduler.finished_recving_kv_req_ids) + + # # STEP (3): Run as usual. + scheduler_output = scheduler.schedule() + + # We need to recompute the final token of the prompt to generate + # the first new token, so we should not have a new block. + num_blocks = len(scheduler.kv_cache_manager.coordinator. 
+ single_type_managers[0].req_to_blocks[request_id]) + assert num_blocks == NUM_EXTERNAL_FULL_BLOCKS + assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens == + NUM_TOKENS - 1) + assert (scheduler_output.num_scheduled_tokens[request_id] == 1) + + model_runner_output = create_model_runner_output([request]) + scheduler.update_from_output(scheduler_output, model_runner_output) + + # # Step (4): Hit EOS. + scheduler_output = scheduler.schedule() + model_runner_output = create_model_runner_output([request], use_eos=True) + scheduler.schedule() + + assert_scheduler_empty(scheduler) diff --git a/tests/ut/kv_connector/utils.py b/tests/ut/kv_connector/utils.py new file mode 100644 index 0000000..13711e7 --- /dev/null +++ b/tests/ut/kv_connector/utils.py @@ -0,0 +1,233 @@ +# SPDX-License-Identifier: Apache-2.0 +# This code is from: https://github.com/vllm-project/vllm/tests/v1/kv_connector/unit/utils.py +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. + +import os +from typing import Any, Optional + +import torch +from vllm import SamplingParams +from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig, + ModelConfig, SchedulerConfig, VllmConfig) +from vllm.v1.core.kv_cache_utils import (get_request_block_hasher, + init_none_hash) +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.utils import vllm_version_is + +EOS_TOKEN_ID = 50256 +os.environ["VLLM_USE_V1"] = "1" + + +def assert_scheduler_empty(scheduler: Scheduler): + """Confirm the scheduler is "empty" - i.e. no leaks.""" + # Scheduler Metadata. 
+ assert len(scheduler.requests) == 0 + assert len(scheduler.waiting) == 0 + assert len(scheduler.running) == 0 + assert len(scheduler.finished_req_ids) == 0 + assert len(scheduler.finished_recving_kv_req_ids) == 0 + + # EncoderCacheManager. + assert len(scheduler.encoder_cache_manager.freed) == 0 + assert len(scheduler.encoder_cache_manager.cached) == 0 + + # KVCache Manager. + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + req_to_blocks) == 0 + assert len(scheduler.kv_cache_manager.coordinator.single_type_managers[0]. + num_cached_block) == 0 + num_free_blocks = ( + scheduler.kv_cache_manager.block_pool.free_block_queue.num_free_blocks) + assert num_free_blocks == ( + scheduler.kv_cache_manager.block_pool.num_gpu_blocks - 1) + + # NOTE(rob): just the ref count on blocks will be 0. The hash + # value, etc will remain since we lazily evict for prefix cache. + for block in scheduler.kv_cache_manager.block_pool.blocks: + assert block.ref_cnt == 0 + + +def create_vllm_config( + max_num_seqs: int = 16, + max_num_batched_tokens: int = 1024, + block_size: int = 128, +) -> VllmConfig: + """Initialize VllmConfig For Testing.""" + scheduler_config = SchedulerConfig( + max_num_seqs=max_num_seqs, + max_num_batched_tokens=max_num_batched_tokens, + max_model_len=max_num_batched_tokens, + ) + fake_weight_path = os.path.join(os.path.dirname(__file__), "..", + "fake_weight") + model_config = ModelConfig( + model=fake_weight_path, + skip_tokenizer_init=True, + ) + # Cache config, optionally force APC + cache_config = CacheConfig( + block_size=block_size, + gpu_memory_utilization=0.9, + swap_space=0, + cache_dtype="auto", + enable_prefix_caching=True, + ) + kv_transfer_config = KVTransferConfig( + kv_connector="LLMDataDistCMgrConnector", + kv_role="kv_both", + kv_connector_module_path= + "vllm_ascend.distributed.llmdatadist_c_mgr_connector") + return VllmConfig(scheduler_config=scheduler_config, + model_config=model_config, + cache_config=cache_config, 
+ kv_transfer_config=kv_transfer_config, + device_config=DeviceConfig("cpu")) + + +def create_scheduler( + vllm_config: VllmConfig, + num_blocks: int = 10000, +) -> Scheduler: + """Initialize Scheduler For Testing.""" + block_size = vllm_config.cache_config.block_size + kv_cache_config = KVCacheConfig( + num_blocks=num_blocks, # A large number of blocks to hold all requests + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer'], + FullAttentionSpec(block_size, 1, 1, torch.float16, + False)) + ], + ) + vllm_config.cache_config.num_gpu_blocks = num_blocks + return Scheduler( + vllm_config=vllm_config, + kv_cache_config=kv_cache_config, + log_stats=True, + structured_output_manager=StructuredOutputManager(vllm_config), + ) + + +_none_hash_initialized = False + + +def create_request( + request_id: int, + num_tokens: int = 10, + max_tokens: int = 128, + do_remote_decode: bool = False, + do_remote_prefill: bool = False, + use_all_1s_for_prompt_tokens: bool = False, + num_remote_blocks: int = 3, + block_size: int = 16, +) -> Request: + """Make dummy request for testing.""" + global _none_hash_initialized + if not _none_hash_initialized: + init_none_hash(hash) + _none_hash_initialized = True + + block_hasher = get_request_block_hasher(block_size, hash) + + kv_transfer_params: Optional[dict[str, Any]] = None + + if do_remote_decode: + assert not do_remote_prefill + kv_transfer_params = dict(do_remote_prefill=False, + do_remote_decode=True) + elif do_remote_prefill: + kv_transfer_params = dict(do_remote_prefill=True, + do_remote_decode=False, + remote_engine_id="my-engine-id", + remote_block_ids=list( + range(num_remote_blocks)), + remote_host="my-host", + remote_port=1234, + remote_tp_size=1) + + max_tokens = 1 if do_remote_decode else max_tokens + sampling_params = SamplingParams(max_tokens=max_tokens) + + if use_all_1s_for_prompt_tokens: + prompt_token_ids = [1] * num_tokens + else: + prompt_token_ids = [i * request_id for i in range(num_tokens)] + + if 
vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + req = Request( + request_id=f"id-{request_id}", + prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params, + multi_modal_kwargs=None, + multi_modal_placeholders=None, + multi_modal_hashes=None, + pooling_params=[], + eos_token_id=EOS_TOKEN_ID, + block_hasher=block_hasher, + ) + else: + req = Request( + request_id=f"id-{request_id}", + prompt_token_ids=prompt_token_ids, + sampling_params=sampling_params, + pooling_params=[], + eos_token_id=EOS_TOKEN_ID, + block_hasher=block_hasher, + ) + req.kv_transfer_params = kv_transfer_params + return req + + +def create_model_runner_output( + reqs: list[Request], + finished_sending: Optional[list[str]] = None, + finished_recving: Optional[list[str]] = None, + use_eos: bool = False, +) -> ModelRunnerOutput: + """Make dummy model runner output for testing.""" + + # Make request data. + req_ids = [req.request_id for req in reqs] + req_id_to_index = {req_id: idx for idx, req_id in enumerate(req_ids)} + + # Make sampled tokens. + sampled_token = EOS_TOKEN_ID if use_eos else 0 + sampled_token_ids = [[sampled_token] for _ in req_ids] + + # Make output data structure. 
+ extra_args = {} + from vllm.v1.worker.kv_connector_model_runner_mixin import \ + KVConnectorOutput # type: ignore # noqa + kv_connector_output = KVConnectorOutput(finished_sending=finished_sending, + finished_recving=finished_recving) + extra_args = {"kv_connector_output": kv_connector_output} + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=req_ids, + req_id_to_index=req_id_to_index, + sampled_token_ids=sampled_token_ids, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[], + **extra_args, + ) + + return model_runner_output diff --git a/tests/ut/models/__init__.py b/tests/ut/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/ut/models/test_deepseek_mtp.py b/tests/ut/models/test_deepseek_mtp.py new file mode 100644 index 0000000..61fdf98 --- /dev/null +++ b/tests/ut/models/test_deepseek_mtp.py @@ -0,0 +1,195 @@ +import pytest +import torch +from pytest_mock import MockerFixture +from transformers import PretrainedConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig + +from tests.ut.base import PytestBase +from vllm_ascend.models.deepseek_mtp import ( + CustomDeepSeekMTP, CustomDeepSeekMultiTokenPredictor, + CustomDeepSeekMultiTokenPredictorLayer) + + +class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase): + + @pytest.fixture + def setup_mtp_layer(self, mocker: MockerFixture): + config = PretrainedConfig(vocab_size=1000, + hidden_size=768, + rms_norm_eps=1e-5) + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm.model_executor.layers.layernorm.RMSNorm.__init__", + return_value=None) + 
mocker.patch( + "vllm.model_executor.models.deepseek_mtp.SharedHead.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__", + return_value=None) + mocker_deepseek_v2_decode_layer = mocker.patch( + "vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None) + mocker_deepseek_v2_decode_layer.assert_called_once() + return mtp_layer + + def test_init(self, mocker: MockerFixture, setup_mtp_layer): + mtp_layer = setup_mtp_layer + assert isinstance(mtp_layer, CustomDeepSeekMultiTokenPredictorLayer) + + def test_forward(self, mocker: MockerFixture, setup_mtp_layer): + mtp_layer = setup_mtp_layer + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker.patch.object(mtp_layer, + 'eh_proj', + return_value=torch.randn(2, 3, 768)) + mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768)) + mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768), + torch.randn(2, 3, 768)) + + input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]) + positions = torch.tensor([[0, 1, 2], [0, 1, 2]]) + kv_cache = torch.randn(2, 3, 768) + previous_hidden_states = torch.randn(2, 3, 768) + inputs_embeds = torch.tensor([[1.0, 2.0, 3.0]]) + + output = mtp_layer(input_ids, positions, kv_cache, None, + previous_hidden_states, inputs_embeds, 0) + assert output.shape == (2, 3, 768) + + +class TestCustomDeepSeekMultiTokenPredictor(PytestBase): + + @pytest.fixture + def setup_predictor(self, mocker: MockerFixture): + mock_vllm_config = mocker.MagicMock(spec=VllmConfig) + mock_model_config = mocker.MagicMock(spec=ModelConfig) + mock_hf_config = 
mocker.MagicMock() + mock_hf_config.num_hidden_layers = 12 + mock_hf_config.num_nextn_predict_layers = 3 + mock_hf_config.vocab_size = 30000 + mock_model_config.hf_config = mock_hf_config + mock_vllm_config.model_config = mock_model_config + mock_vllm_config.cache_config = CacheConfig() + mock_vllm_config.quant_config = mocker.MagicMock() + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + predictor = CustomDeepSeekMultiTokenPredictor( + vllm_config=mock_vllm_config) + return predictor + + def test_init(self, mocker: MockerFixture, setup_predictor): + predictor = setup_predictor + assert predictor.num_mtp_layers == 3 + assert isinstance(predictor, CustomDeepSeekMultiTokenPredictor) + + @pytest.mark.parametrize( + 'kv_caches, inputs_embeds', + [(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))]) + def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches, + inputs_embeds): + predictor = setup_predictor + mock_layer = mocker.MagicMock() + mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) + predictor.layers_list = [mock_layer] + + # todo: need or not? 
+ # predictor.num_mtp_layers = 1 + input_ids = torch.tensor([[1, 2, 3]]) + positions = torch.tensor([[0, 1, 2]]) + mocker.patch( + "vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__", + return_value=torch.tensor([[1.0, 2.0, 3.0]])) + output = predictor.forward(input_ids, positions, kv_caches, None, None, + inputs_embeds, 0) + mock_layer.assert_called_once() + assert torch.allclose(output, torch.tensor([1.0, 2.0, 3.0])) + + def test_compute_logits(self, mocker: MockerFixture, setup_predictor): + hidden_states = torch.tensor([[1, 2, 3], [4, 5, 6]]) + predictor = setup_predictor + + mock_layer = mocker.MagicMock() + mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) + predictor.layers_list = [mock_layer] + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker.patch( + "vllm.model_executor.layers.logits_processor.LogitsProcessor.__init__", + return_value=None) + predictor.logits_processor.return_value = torch.tensor([1.0, 2.0, 3.0]) + + result_logits = predictor.compute_logits(hidden_states=hidden_states, + sampling_metadata=None) + predictor.logits_processor.assert_called_once() + assert torch.allclose(result_logits, torch.tensor([1.0, 2.0, 3.0])) + + +class TestCustomDeepSeekMTP(PytestBase): + + @pytest.fixture + def setup_mtp(self, mocker: MockerFixture): + vllm_config = mocker.MagicMock() + vllm_config.model_config.hf_config.num_hidden_layers = 12 + vllm_config.model_config.hf_config.num_nextn_predict_layers = 3 + vllm_config.cache_config = mocker.MagicMock() + vllm_config.quant_config = mocker.MagicMock() + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch( + 
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__", + return_value=None) + mocker.patch("vllm.model_executor.layers.sampler.get_sampler", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + mtp = CustomDeepSeekMTP(vllm_config=vllm_config) + return mtp + + def test_init(self, mocker: MockerFixture, setup_mtp): + mtp = setup_mtp + assert isinstance(mtp, CustomDeepSeekMTP) + + def test_forward(self, mocker: MockerFixture, setup_mtp): + input_ids = torch.tensor([[1, 2, 3]]) + positions = torch.tensor([[0, 1, 2]]) + kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])] + previous_hidden_states = torch.tensor([[0.1, 0.2, 0.3]]) + inputs_embeds = torch.tensor([[0.1, 0.2, 0.3]]) + spec_step_idx = 0 + setup_mtp.model.return_value = torch.tensor([[1.0, 2.0, 3.0]]) + + output = setup_mtp.forward(input_ids, positions, kv_caches, None, + previous_hidden_states, inputs_embeds, + spec_step_idx) + assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]])) diff --git a/tests/ut/models/test_deepseek_v2.py b/tests/ut/models/test_deepseek_v2.py new file mode 100644 index 0000000..df14a2a --- /dev/null +++ b/tests/ut/models/test_deepseek_v2.py @@ -0,0 +1,295 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import pytest +import torch +from transformers import PretrainedConfig +from vllm.config import CacheConfig +from vllm.distributed.parallel_state import GroupCoordinator + +from vllm_ascend.models.deepseek_v2 import ( + CustomDeepseekV2MergedReplicatedLinear, CustomDeepseekV2MLAAttention, + CustomDeepseekV2MLP, CustomDeepseekV2MoE, + CustomDeepseekV2RowParallelLinear, + CustomDeepseekV2RowParallelLinearReplaceAllreduce, + CustomDeepseekV2SiluAndMul, LogitsProcessor, ParallelLMHead) + + +@pytest.fixture +def base_config(): + config = PretrainedConfig( + hidden_size=128, + num_attention_heads=8, + num_hidden_layers=2, + intermediate_size=256, + hidden_act="silu", + rms_norm_eps=1e-6, + rope_theta=10000.0, + max_position_embeddings=2048, + n_routed_experts=4, + n_shared_experts=1, + moe_intermediate_size=256, + num_experts_per_tok=2, + routed_scaling_factor=1.0, + first_k_dense_replace=0, + moe_layer_freq=1, + kv_lora_rank=16, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + topk_method="noaux_tc", + scoring_func="softmax", + norm_topk_prob=True, + n_group=1, + topk_group=1, + vocab_size=10000, + ) + return config + + +@pytest.fixture +def vllm_config(base_config): + model_config = SimpleNamespace( + hf_config=base_config, + tensor_parallel_size=1, + dtype=torch.float32, + use_mla=False, + quant_config=None, + max_model_len=2048, + ) + + cache_config = CacheConfig() + vllm_config = Mock() + vllm_config.model_config = model_config + vllm_config.cache_config = cache_config + vllm_config.quant_config = None + return vllm_config + + +@pytest.fixture +def mock_distributed(): + tp_group = Mock(spec=GroupCoordinator) + tp_group.rank_in_group = 0 + tp_group.world_size = 1 + tp_group.device_group = Mock() + + dp_group = Mock(spec=GroupCoordinator) + dp_group.rank_in_group = 0 + dp_group.world_size = 1 + + ep_group = Mock(spec=GroupCoordinator) + ep_group.rank_in_group = 0 + 
ep_group.world_size = 1 + + pp_group = Mock(spec=GroupCoordinator) + pp_group.rank_in_group = 0 + pp_group.world_size = 1 + + mock_vllm_config = Mock() + mock_vllm_config.scheduler_config = Mock(max_num_seqs=256) + mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None) + + with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \ + patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \ + patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \ + patch("vllm_ascend.models.deepseek_v2.get_ep_group", return_value=ep_group), \ + patch("vllm_ascend.models.deepseek_v2.get_dp_group", return_value=dp_group), \ + patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \ + patch("vllm_ascend.models.deepseek_v2.get_pp_group", + return_value=Mock(is_first_rank=False, is_last_rank=False)), \ + patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \ + patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group, + _PP=pp_group), \ + patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \ + patch("torch.npu.current_device", return_value=0): + yield + + +@pytest.fixture +def mock_forward_context(): + forward_context = Mock(in_profile_run=False, with_prefill=False) + with patch("vllm_ascend.models.deepseek_v2.get_forward_context", + return_value=forward_context): + yield + + +def test_custom_deepseek_v2_silu_and_mul(): + torch.set_default_device("cpu") + + silu = CustomDeepseekV2SiluAndMul() + assert silu.weight_scale is None + + x = torch.randn(2, 4) + output = silu.forward_oot(x) + assert output.shape == (2, 2) + + weight_scale = Mock(return_value=torch.tensor(0.1)) + silu = CustomDeepseekV2SiluAndMul(weight_scale=weight_scale) + quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32) + dynamic_scale = torch.randn(2, 1) + with 
patch("torch_npu.npu_dequant_swiglu_quant", + return_value=torch.randn(2, 4)): + output = silu.forward_oot((quant_x, dynamic_scale)) + assert output.shape == (2, 4) + + +def test_custom_deepseek_v2_merged_replicated_linear(mock_distributed): + linear = CustomDeepseekV2MergedReplicatedLinear(input_size=128, + output_sizes=[64, 64], + bias=False, + quant_config=None) + assert linear.output_sizes == [64, 64] + + param = Mock() + param.data = torch.zeros(128, 128) + param.output_dim = 1 + param.is_gguf_weight = False + param.is_gguf_weight_type = False + loaded_weight = torch.randn(128, 64) + linear.weight_loader(param, loaded_weight, loaded_shard_id=0) + + with pytest.raises(AssertionError): + linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0) + + +@pytest.mark.parametrize("cls", [ + CustomDeepseekV2RowParallelLinearReplaceAllreduce, + CustomDeepseekV2RowParallelLinear +]) +def test_row_parallel_linear(cls, mock_distributed): + linear = cls(input_size=128, output_size=64, bias=False, quant_config=None) + linear.quant_method = Mock() + linear.quant_method.apply.return_value = torch.randn(2, 4, 64) + + input_ = torch.randn(2, 4, 128) + with patch("vllm_ascend.models.deepseek_v2.split_tensor_along_last_dim", + return_value=[torch.randn(2, 4, 64)]): + linear.input_is_parallel = False + output = linear(input_, is_prefill=True) + assert output[0].shape == (2, 4, 64) + + linear.input_is_parallel = True + output = linear(input_, is_prefill=False) + assert output[0].shape == (2, 4, 64) + + +def test_custom_deepseek_v2_mlp(mock_distributed, base_config): + mlp = CustomDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="silu", + quant_config=None) + assert isinstance(mlp.act_fn, CustomDeepseekV2SiluAndMul) + + x = torch.randn(2, 4, 128) + output = mlp(x) + assert output.shape == (2, 4, 128) + + with patch("vllm_ascend.models.deepseek_v2.QuantizationConfig" + ) as mock_quant_config: + mock_quant_config.name = "w8a8dynamic" + with 
pytest.raises(NotImplementedError): + CustomDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="silu", + quant_config=mock_quant_config, + force_replicate=False) + with pytest.raises(ValueError): + CustomDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="relu", + quant_config=None) + + +def test_custom_deepseek_v2_moe(mock_distributed, base_config, + mock_forward_context): + base_config.n_shared_experts = 1 + moe = CustomDeepseekV2MoE(config=base_config, + quant_config=None, + prefix="mlp") + assert moe.top_k == 2 + + x = torch.randn(2, 4, 128) + attn_metadata = Mock(num_prefills=1) + with patch("vllm_ascend.ops.fused_moe.AscendFusedMoE.__call__", + return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))): + output = moe(x, attn_metadata) + assert output.shape == (2, 4, 128) + + +@patch("torch_npu.npu_rms_norm") +def test_custom_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed, + base_config): + mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128)) + + attn = CustomDeepseekV2MLAAttention(config=base_config, + hidden_size=128, + num_heads=8, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + q_lora_rank=16, + kv_lora_rank=16, + cache_config=CacheConfig(), + quant_config=None, + prefix="layers.0.self_attn") + assert attn.debug_layer_idx == 0 + + x = torch.randn(2, 4, 128) + positions = torch.arange(4).repeat(2, 1) + with patch.object(attn.mla_attn, + "__call__", + return_value=torch.randn(2, 4, 128)): + with pytest.raises(AssertionError): + attn(positions, x) + + attn = CustomDeepseekV2MLAAttention(config=base_config, + hidden_size=128, + num_heads=8, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + q_lora_rank=None, + kv_lora_rank=16, + prefix="layers.1.self_attn") + assert hasattr(attn, "q_proj") + + +def test_deepseek_v2_lmhead(mock_distributed, vllm_config): + # 创建一个简单的配置对象 + class SimpleConfig: + + def __init__(self): + self.vocab_size = 10000 + self.hidden_size = 
128 + + config = SimpleConfig() + + # 直接创建lmhead和logits_processor + lmhead = ParallelLMHead(config.vocab_size, config.hidden_size) + logits_processor = LogitsProcessor(config.vocab_size) + + # 创建模拟输出 + mock_output = torch.randn(2, 4, config.hidden_size) + mock_logits = torch.randn(2, 4, config.vocab_size) + + # 直接测试logits_processor + with patch.object(lmhead.quant_method, "apply", return_value=mock_logits): + with patch.object(logits_processor, + "_gather_logits", + return_value=mock_logits): + logits = logits_processor(lmhead, mock_output) + assert logits.shape == (2, 4, config.vocab_size) diff --git a/tests/ut/models/test_qwen2_5_vl.py b/tests/ut/models/test_qwen2_5_vl.py new file mode 100644 index 0000000..15367eb --- /dev/null +++ b/tests/ut/models/test_qwen2_5_vl.py @@ -0,0 +1,424 @@ +import pytest +import torch +import torch.nn.functional as F +from pytest_mock import MockerFixture + +from tests.ut.base import PytestBase +from vllm_ascend.models.qwen2_5_vl import ( + AscendQwen2_5_VisionAttention, AscendQwen2_5_VisionBlock, + AscendQwen2_5_VisionPatchEmbed, AscendQwen2_5_VisionRotaryEmbedding, + AscendQwen2_5_VisionTransformer, AscendQwen2_5_VLForConditionalGeneration) + + +class TestAscendQwen2_5_VisionAttention(PytestBase): + + def init_attention( + self, + mocker, + embed_dim=1000, + num_heads=10, + projection_size=100, + quant_config=None, + prefix="", + ): + mocker_attn = mocker.patch( + "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionAttention.__init__") + + attention = AscendQwen2_5_VisionAttention( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + args, kwargs = mocker_attn.call_args + assert args == (embed_dim, num_heads, projection_size, None, "") + assert not kwargs + attention.num_attention_heads_per_partition = num_heads + return attention + + def test_attn_init_should_normal(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 10 + projection_size = 100 + 
quant_config = None + prefix = "" + vit = self.init_attention( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + mocker=mocker, + ) + assert vit.embed_dim == 1000 + assert vit.hidden_size_per_attention_head == 10 + + def test_attn_init_should_raise_error(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 7 + projection_size = 100 + quant_config = None + prefix = "" + with pytest.raises(AssertionError): + # projection_size should divided by num heads + self.init_attention( + mocker=mocker, + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + + def test_split_qkv(self, mocker: MockerFixture): + attention = self.init_attention(mocker=mocker) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + q, k, v = attention.split_qkv(torch.rand((100, 10, 300))) + assert q.shape == (100, 10, 10, 10) + assert k.shape == (100, 10, 10, 10) + assert v.shape == (100, 10, 10, 10) + + def test_attn_forward(self, mocker: MockerFixture): + attention = self.init_attention(mocker=mocker) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + + qkv = lambda x: (x, 0) # noqa + split_qkv = lambda x: [ #noqa + torch.rand((100, 3, 10, 128)) for i in range(3) + ] # noqa + npu_rotary_mul = lambda q, cos, sin: q # noqa + _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa + proj = lambda x: (x, 0) # noqa + + mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) + mocker_split_qkv = mocker.patch.object( + attention, + "split_qkv", + side_effect=split_qkv, + 
) + mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", + side_effect=npu_rotary_mul) + mocker_npu_flash_attention_unpad = mocker.patch( + "torch_npu._npu_flash_attention_unpad", + side_effect=_npu_flash_attention_unpad, + ) + mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) + attention.__dict__["qkv"] = mocker_qkv + attention.__dict__["split_qkv"] = mocker_split_qkv + attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul + attention.__dict__["_npu_flash_attention_unpad"] = ( + mocker_npu_flash_attention_unpad) + attention.__dict__["proj"] = mocker_proj + + output = attention.forward( + x=x, + cu_seqlens=cu_seqlens, + cos=cos, + sin=sin, + ) + qkv_args, qkv_kwargs = mocker_qkv.call_args + assert qkv_args == (x, ) + assert not qkv_kwargs + + split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args + assert split_qkv_args == (x, ) + assert not split_qkv_kwargs + + npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args + assert npu_rotary_mul_args[1:] == (cos, sin) + assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) + assert not npu_rotary_mul_kwargs + + assert output.shape == torch.Size([100, 3, 1280]) + + +class TestAscendQwen2_5_VisionBlock(PytestBase): + + def init_vision_block( + self, + mocker, + dim=100, + num_heads=10, + mlp_hidden_dim=100, + ): + mocker_vit = mocker.patch( + "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", + return_value=None, + ) + + mocker_attn = mocker.patch( + "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionAttention.__init__", + return_value=None, + ) + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + vision_block = AscendQwen2_5_VisionBlock( + dim=dim, + num_heads=num_heads, + mlp_hidden_dim=mlp_hidden_dim, + ) + args, kwargs = mocker_vit.call_args + assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") + assert not 
kwargs + + args1, kwargs1 = mocker_attn.call_args + assert not args1 + assert kwargs1 == { + "embed_dim": dim, + "num_heads": num_heads, + "projection_size": dim, + "quant_config": None, + "prefix": ".attn", + } + return vision_block + + def test_init_vision_block_should_normal( + self, + mocker: MockerFixture, + ): + vision_block = self.init_vision_block(mocker) + assert isinstance(vision_block, AscendQwen2_5_VisionBlock) + + def test_vision_block_forward(self, mocker: MockerFixture): + x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + vision_block = self.init_vision_block(mocker) + mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) + mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) + vision_block.__dict__["attn"] = mocker_attn + vision_block.__dict__["mlp"] = mocker_mlp + + output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) + + _, attn_kwargs = mocker_attn.call_args + assert attn_kwargs == { + "cu_seqlens": cu_seqlens, + "cos": cos, + "sin": sin, + } + + assert torch.all(x * 3 == output) + + +class TestAscendQwen2_5_VisionPatchEmbed(PytestBase): + + def test_forward(self): + patch_embed = AscendQwen2_5_VisionPatchEmbed() + + ret = patch_embed(torch.rand((120, 1176))) + assert ret.shape == (120, 1152) + + +class TestAscendQwen2_5_VisionRotaryEmbedding(PytestBase): + + def init_rotary_embedding( + self, + mocker, + dim=128, + ): + mocker_ebed = mocker.patch( + "vllm_ascend.models.qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding.__init__", + return_value=None, + ) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + rotary_embedding = AscendQwen2_5_VisionRotaryEmbedding(dim=dim, ) + args, kwargs = mocker_ebed.call_args + assert args == (dim, 10000.0) + assert not kwargs + return rotary_embedding + + def 
test_init_rotary_embedding_should_normal(self, mocker: MockerFixture): + rotary_embedding = self.init_rotary_embedding(mocker) + assert isinstance(rotary_embedding, + AscendQwen2_5_VisionRotaryEmbedding) + + +class TestAscendQwen2_5_VisionTransformer(PytestBase): + + input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) + + def init_vision_transformer( + self, + mocker, + ): + norm_eps = 1e-6 + vision_config = mocker.MagicMock() + vision_config.patch_size = 16 + vision_config.temporal_patch_size = 2 + vision_config.in_channels = 3 + vision_config.hidden_act = "gelu" + vision_config.depth = 0 + vision_config.num_heads = 10 + vision_config.hidden_size = 300 + + mocker.patch( + "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_rank", + return_value=0, + ) + mocker.patch("vllm.distributed.utils.divide", return_value=100) + mocker.patch( + "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", + return_value=2, + ) + mocker.patch( + "vllm.model_executor.layers.linear.divide", + return_value=2, + ) + mocker.patch( + "vllm.model_executor.layers.linear.get_tensor_model_parallel_rank", + return_value=0) + mocker.patch( + "vllm_ascend.models.qwen2_5_vl.parallel_state.get_tensor_model_parallel_world_size", + return_value=2, + ) + + vision_transformer = AscendQwen2_5_VisionTransformer( + vision_config, + norm_eps, + ) + + assert not vision_transformer.interleaved + return vision_transformer + + def test_init_vision_transformer(self, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + assert isinstance(vision_transformer, AscendQwen2_5_VisionTransformer) + + @pytest.mark.parametrize( + "interleaved, expected", + [ + ( + False, + torch.tensor([ + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + ]), + ), + ( + True, + torch.tensor([ + input_data[0, 
0].cos(), + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[0, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + input_data[1, 1].cos(), + ]), + ), + ], + ) + def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + vision_transformer.__dict__["interleaved"] = interleaved + vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 + vision_transformer.hidden_size_per_attention_head = 4 + cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) + assert cos_new.shape == (1, 32, 1, 2) + + def test_forward(self, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + x = torch.randn(1, 3, 224, 224) + grid_thw = torch.tensor([[1, 4, 4]]) + mocker_patch_embed = mocker.patch.object( + vision_transformer, + "patch_embed", + side_effect=lambda _: torch.randn(16, 512), # noqa + ) + mocker_rot_pos_emb = mocker.patch.object( + vision_transformer, + "rot_pos_emb", + side_effect=lambda _: torch.randn(16, 64), # noqa + ) + mocker_get_window_index = mocker.patch.object( + vision_transformer, + "get_window_index", + side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa + ) + mocker_cal_cos_sin = mocker.patch.object( + vision_transformer, + "cal_cos_sin", + side_effect=lambda _: + (torch.randn(16, 32), torch.randn(16, 32)), # noqa + ) + mocker_merger = mocker.patch.object( + vision_transformer, + "merger", + side_effect=lambda _: torch.randn(16, 256), # noqa + ) + vision_transformer.__dict__["vision_blocks"] = [ + lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa + ] + vision_transformer.__dict__["patch_embed"] = 
mocker_patch_embed + vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb + vision_transformer.__dict__[ + "get_window_index"] = mocker_get_window_index + vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin + vision_transformer.__dict__["merger"] = mocker_merger + vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] + vision_transformer.__dict__["spatial_merge_unit"] = 2 + ret = vision_transformer.forward(x, grid_thw) + assert ret.shape == (8, 256) + mocker_patch_embed.assert_called_with(x) + mocker_rot_pos_emb.assert_called_with(grid_thw) + mocker_get_window_index.assert_called_with(grid_thw) + mocker_cal_cos_sin.assert_called_once() + mocker_merger.assert_called_once() + + +class TestAscendQwen2_5_VLForConditionalGeneration(PytestBase): + + def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): + vllm_config = mocker.MagicMock() + vllm_config.vision_config = "vision_config" + vllm_config.rms_norm_eps = 1e-5 + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker_vl = mocker.patch( + "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", + return_value=None, + ) + mocker_vit = mocker.patch( + "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionTransformer.__init__", + return_value=None, + ) + + vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration( + vllm_config=vllm_config) + args, kwargs = mocker_vl.call_args + assert not args + assert kwargs == {"vllm_config": vllm_config, "prefix": ""} + mocker_vit.assert_called_once() + assert isinstance( + vl_for_conditional_generation, + AscendQwen2_5_VLForConditionalGeneration, + ) diff --git a/tests/ut/models/test_qwen2_5_vl_without_padding.py b/tests/ut/models/test_qwen2_5_vl_without_padding.py new file mode 100644 index 0000000..00caf81 --- /dev/null +++ b/tests/ut/models/test_qwen2_5_vl_without_padding.py @@ -0,0 +1,422 @@ +import 
pytest +import torch +import torch.nn.functional as F +from pytest_mock import MockerFixture +from vllm.model_executor.models.qwen2_5_vl import \ + Qwen2_5_VLForConditionalGeneration + +from tests.ut.base import PytestBase +from vllm_ascend.models.qwen2_5_vl_without_padding import ( + AscendQwen2_5_VisionAttention_Without_Padding, + AscendQwen2_5_VisionBlock_Without_Padding, + AscendQwen2_5_VisionPatchEmbed_Without_Padding, + AscendQwen2_5_VisionTransformer_Without_Padding, + AscendQwen2_5_VLForConditionalGeneration_Without_Padding) + + +class TestAscendQwen2_5_VisionAttention_Without_Padding(PytestBase): + + def init_attention( + self, + mocker, + embed_dim=1000, + num_heads=10, + projection_size=100, + quant_config=None, + prefix="", + ): + mocker_attn = mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.Qwen2_5_VisionAttention.__init__" + ) + + attention = AscendQwen2_5_VisionAttention_Without_Padding( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + args, kwargs = mocker_attn.call_args + assert args == (embed_dim, num_heads, projection_size, None, "") + assert not kwargs + attention.num_attention_heads_per_partition = num_heads + return attention + + def test_vit_init_should_normal(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 10 + projection_size = 100 + quant_config = None + prefix = "" + vit = self.init_attention( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + mocker=mocker, + ) + assert vit.embed_dim == 1000 + assert vit.hidden_size_per_attention_head == 10 + + def test_vit_init_should_raise_error(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 7 + projection_size = 100 + quant_config = None + prefix = "" + with pytest.raises(AssertionError): + # projection_size should divided by num heads + self.init_attention( + mocker=mocker, + embed_dim=embed_dim, 
+ num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + + def test_vit_forward(self, mocker: MockerFixture): + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + attention = self.init_attention(mocker=mocker) + x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + + qkv = lambda x: (x, 0) # noqa + split_qkv = lambda x: [ #noqa + torch.rand((100, 3, 10, 128)) for i in range(3) + ] # noqa + npu_rotary_mul = lambda q, cos, sin: q # noqa + _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa + proj = lambda x: (x, 0) # noqa + + mocker_qkv = mocker.patch.object(attention, "qkv", side_effect=qkv) + mocker_split_qkv = mocker.patch.object( + attention, + "split_qkv", + side_effect=split_qkv, + ) + mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", + side_effect=npu_rotary_mul) + mocker_npu_flash_attention_unpad = mocker.patch( + "torch_npu._npu_flash_attention_unpad", + side_effect=_npu_flash_attention_unpad, + ) + mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) + attention.__dict__["qkv"] = mocker_qkv + attention.__dict__["split_qkv"] = mocker_split_qkv + attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul + attention.__dict__["_npu_flash_attention_unpad"] = ( + mocker_npu_flash_attention_unpad) + attention.__dict__["proj"] = mocker_proj + + output = attention.forward( + x=x, + cu_seqlens=cu_seqlens, + cos=cos, + sin=sin, + ) + qkv_args, qkv_kwargs = mocker_qkv.call_args + assert qkv_args == (x, ) + assert not qkv_kwargs + + split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args + assert split_qkv_args == (x, ) + assert not split_qkv_kwargs + + npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args + assert 
npu_rotary_mul_args[1:] == (cos, sin) + assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) + assert not npu_rotary_mul_kwargs + + assert output.shape == torch.Size([100, 3, 1280]) + + +class TestAscendQwen2_5_VisionBlock_Without_Padding(PytestBase): + + def init_vision_block( + self, + mocker, + dim=100, + num_heads=10, + mlp_hidden_dim=100, + ): + mocker_vit = mocker.patch( + "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionBlock.__init__", + return_value=None, + ) + + mocker_attn = mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionAttention_Without_Padding.__init__", + return_value=None, + ) + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + vision_block = AscendQwen2_5_VisionBlock_Without_Padding( + dim=dim, + num_heads=num_heads, + mlp_hidden_dim=mlp_hidden_dim, + ) + args, kwargs = mocker_vit.call_args + assert args == (dim, num_heads, mlp_hidden_dim, F.silu, None, None, "") + assert not kwargs + + args1, kwargs1 = mocker_attn.call_args + assert not args1 + assert kwargs1 == { + "embed_dim": dim, + "num_heads": num_heads, + "projection_size": dim, + "quant_config": None, + "prefix": ".attn", + } + return vision_block + + def test_init_vision_block_should_normal( + self, + mocker: MockerFixture, + ): + vision_block = self.init_vision_block(mocker) + assert isinstance(vision_block, + AscendQwen2_5_VisionBlock_Without_Padding) + + def test_vision_block_forward(self, mocker: MockerFixture): + x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + vision_block = self.init_vision_block(mocker) + mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) + mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) + vision_block.__dict__["attn"] = mocker_attn + 
vision_block.__dict__["mlp"] = mocker_mlp + + output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) + + _, attn_kwargs = mocker_attn.call_args + assert attn_kwargs == { + "cu_seqlens": cu_seqlens, + "cos": cos, + "sin": sin, + } + + assert torch.all(x * 3 == output) + + +class TestAscendQwen2_5_VisionPatchEmbed_Without_Padding(PytestBase): + + def test_forward(self): + patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding() + + ret = patch_embed(torch.rand((120, 1176))) + assert ret.shape == (120, 1152) + + +class TestAscendQwen2_5_VisionTransformer_Without_Padding(PytestBase): + + input_data = torch.tensor([[0.1, 0.2], [0.3, 0.4]]) + + def init_vision_transformer( + self, + mocker, + ): + norm_eps = 1e-6 + vision_config = mocker.MagicMock() + vision_config.patch_size = 16 + vision_config.temporal_patch_size = 2 + vision_config.in_channels = 3 + vision_config.hidden_act = "gelu" + vision_config.depth = 0 + vision_config.hidden_size = 1280 + vision_config.num_heads = 16 + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker_vit = mocker.patch( + "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__", + return_value=None, + ) + mocker_vision_rotary_embedding = mocker.patch( + "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__", + return_value=None, + ) + mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__", + return_value=None, + ) + mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionPatchEmbed_Without_Padding.__init__", + return_value=None, + ) + mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_world_size", + return_value=1, + ) + mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.parallel_state.get_tensor_model_parallel_rank", + return_value=0, + ) + 
mocker.patch("vllm.distributed.utils.divide", return_value=100) + + vision_transformer = AscendQwen2_5_VisionTransformer_Without_Padding( + vision_config, + norm_eps, + ) + args, kwargs = mocker_vit.call_args + assert args == (vision_config, norm_eps, None, "") + assert not kwargs + mocker_vision_rotary_embedding.assert_called_once() + return vision_transformer + + def test_init_vision_transformer(self, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + assert isinstance(vision_transformer, + AscendQwen2_5_VisionTransformer_Without_Padding) + + @pytest.mark.parametrize( + "interleaved, expected", + [ + ( + False, + torch.tensor([ + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + ]), + ), + ( + True, + torch.tensor([ + input_data[0, 0].cos(), + input_data[0, 0].cos(), + input_data[0, 1].cos(), + input_data[0, 1].cos(), + input_data[1, 0].cos(), + input_data[1, 0].cos(), + input_data[1, 1].cos(), + input_data[1, 1].cos(), + ]), + ), + ], + ) + def test_cal_cos_sin(self, interleaved, expected, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + vision_transformer.__dict__["interleaved"] = interleaved + vision_transformer.__dict__["hidden_size_per_attention_head"] = 2 + vision_transformer.hidden_size_per_attention_head = 4 + cos_new, _ = vision_transformer.cal_cos_sin(self.input_data) + assert cos_new.shape == (1, 4, 1, 2) + assert torch.allclose(cos_new.view(-1), expected) + + def test_forward(self, mocker: MockerFixture): + vision_transformer = self.init_vision_transformer(mocker) + x = torch.randn(1, 3, 224, 224) + grid_thw = torch.tensor([[1, 4, 4]]) + mocker_patch_embed = mocker.patch.object( + vision_transformer, + "patch_embed", + side_effect=lambda _: torch.randn(16, 512), # noqa + ) + mocker_rot_pos_emb = mocker.patch.object( + 
vision_transformer, + "rot_pos_emb", + side_effect=lambda _: torch.randn(16, 64), # noqa + ) + mocker_get_window_index = mocker.patch.object( + vision_transformer, + "get_window_index", + side_effect=lambda _: (torch.arange(8), [4, 8, 12, 16]), # noqa + ) + mocker_cal_cos_sin = mocker.patch.object( + vision_transformer, + "cal_cos_sin", + side_effect=lambda _: + (torch.randn(16, 32), torch.randn(16, 32)), # noqa + ) + mocker_merger = mocker.patch.object( + vision_transformer, + "merger", + side_effect=lambda _: torch.randn(16, 256), # noqa + ) + vision_transformer.__dict__["vision_blocks"] = [ + lambda *args, **kwargs: torch.randn(16, 1, 512) # noqa + ] + vision_transformer.__dict__["patch_embed"] = mocker_patch_embed + vision_transformer.__dict__["rot_pos_emb"] = mocker_rot_pos_emb + vision_transformer.__dict__[ + "get_window_index"] = mocker_get_window_index + vision_transformer.__dict__["cal_cos_sin"] = mocker_cal_cos_sin + vision_transformer.__dict__["merger"] = mocker_merger + vision_transformer.__dict__["fullatt_block_indexes"] = [0, 2] + vision_transformer.__dict__["spatial_merge_unit"] = 2 + ret = vision_transformer.forward(x, grid_thw) + assert ret.shape == (8, 256) + mocker_patch_embed.assert_called_with(x) + mocker_rot_pos_emb.assert_called_with(grid_thw) + mocker_get_window_index.assert_called_with(grid_thw) + mocker_cal_cos_sin.assert_called_once() + mocker_merger.assert_called_once() + + +class TestAscendQwen2_5_VLForConditionalGeneration_Without_Padding(PytestBase): + + def test_init_vl_for_conditional_generation(self, mocker: MockerFixture): + vllm_config = mocker.MagicMock() + vllm_config.vision_config = "vision_config" + vllm_config.rms_norm_eps = 1e-5 + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker_vl = mocker.patch( + "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.__init__", + return_value=None, + ) + mocker_vit = 
mocker.patch( + "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionTransformer_Without_Padding.__init__", + return_value=None, + ) + + vl_for_conditional_generation = AscendQwen2_5_VLForConditionalGeneration_Without_Padding( + vllm_config=vllm_config) + args, kwargs = mocker_vl.call_args + assert not args + assert kwargs == {"vllm_config": vllm_config, "prefix": ""} + mocker_vit.assert_called_once() + assert isinstance( + vl_for_conditional_generation, + AscendQwen2_5_VLForConditionalGeneration_Without_Padding, + ) + + def test_overridden_methods(self): + self.assert_method_overridden( + AscendQwen2_5_VLForConditionalGeneration_Without_Padding, + Qwen2_5_VLForConditionalGeneration, + "_process_image_input", + ) + + self.assert_method_overridden( + AscendQwen2_5_VLForConditionalGeneration_Without_Padding, + Qwen2_5_VLForConditionalGeneration, + "_process_video_input", + ) + + @staticmethod + def assert_method_overridden(subclass, parent, method_name: str): + """assert subclass override parent method""" + parent_func = parent.__dict__.get(method_name) + child_func = subclass.__dict__.get(method_name) + + assert child_func is not None, f"{subclass.__name__} should defined {method_name}" + assert child_func is not parent_func, f"{method_name} should override in {subclass.__name__}" diff --git a/tests/ut/models/test_qwen2_vl.py b/tests/ut/models/test_qwen2_vl.py new file mode 100644 index 0000000..d62b859 --- /dev/null +++ b/tests/ut/models/test_qwen2_vl.py @@ -0,0 +1,200 @@ +import pytest +import torch +from pytest_mock import MockerFixture +from vllm.model_executor.layers.activation import QuickGELU + +from tests.ut.base import PytestBase +from vllm_ascend.models.qwen2_vl import (AscendQwen2VisionAttention, + AscendQwen2VisionBlock) + + +class TestAscendQwen2VisionAttention(PytestBase): + + def init_attention( + self, + mocker, + embed_dim=1000, + num_heads=10, + projection_size=100, + quant_config=None, + prefix="", + ): + mocker_attn = mocker.patch( 
+ "vllm_ascend.models.qwen2_vl.Qwen2VisionAttention.__init__") + + attention = AscendQwen2VisionAttention( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + args, kwargs = mocker_attn.call_args + assert args == (embed_dim, num_heads, projection_size, None, "") + assert not kwargs + attention.num_attention_heads_per_partition = num_heads + return attention + + def test_attn_init_should_normal(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 10 + projection_size = 100 + quant_config = None + prefix = "" + vit = self.init_attention( + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + mocker=mocker, + ) + assert vit.hidden_size_per_attention_head == 10 + + def test_attn_init_should_raise_error(self, mocker: MockerFixture): + embed_dim = 1000 + num_heads = 7 + projection_size = 100 + quant_config = None + prefix = "" + with pytest.raises(AssertionError): + # projection_size should divided by num heads + self.init_attention( + mocker=mocker, + embed_dim=embed_dim, + num_heads=num_heads, + projection_size=projection_size, + quant_config=quant_config, + prefix=prefix, + ) + + def test_attn_forward(self, mocker: MockerFixture): + attention = self.init_attention(mocker=mocker) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + x = torch.rand((100, 3, 10 * 3 * 128)) # s,b, head*3*head_dim + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + + qkv = lambda x: (x, 0) # noqa + split_qkv = lambda x: [ #noqa + torch.rand((100, 3, 10, 128)) for i in range(3) + ] # noqa + npu_rotary_mul = lambda q, cos, sin: q # noqa + _npu_flash_attention_unpad = lambda **kwargs: kwargs["out"] # noqa + proj = lambda x: (x, 0) # noqa + + mocker_qkv = 
mocker.patch.object(attention, "qkv", side_effect=qkv) + mocker_split_qkv = mocker.patch.object( + attention, + "split_qkv", + side_effect=split_qkv, + ) + mocker_npu_rotary_mul = mocker.patch("torch_npu.npu_rotary_mul", + side_effect=npu_rotary_mul) + mocker_npu_flash_attention_unpad = mocker.patch( + "torch_npu._npu_flash_attention_unpad", + side_effect=_npu_flash_attention_unpad, + ) + mocker_proj = mocker.patch.object(attention, "proj", side_effect=proj) + attention.__dict__["qkv"] = mocker_qkv + attention.__dict__["split_qkv"] = mocker_split_qkv + attention.__dict__["npu_rotary_mul"] = mocker_npu_rotary_mul + attention.__dict__["_npu_flash_attention_unpad"] = ( + mocker_npu_flash_attention_unpad) + attention.__dict__["proj"] = mocker_proj + + output = attention.forward( + x=x, + cu_seqlens=cu_seqlens, + cos=cos, + sin=sin, + ) + qkv_args, qkv_kwargs = mocker_qkv.call_args + assert qkv_args == (x, ) + assert not qkv_kwargs + + split_qkv_args, split_qkv_kwargs = mocker_split_qkv.call_args + assert split_qkv_args == (x, ) + assert not split_qkv_kwargs + + npu_rotary_mul_args, npu_rotary_mul_kwargs = mocker_npu_rotary_mul.call_args + assert npu_rotary_mul_args[1:] == (cos, sin) + assert npu_rotary_mul_args[0].shape == torch.Size([3, 100, 10, 128]) + assert not npu_rotary_mul_kwargs + + assert output.shape == torch.Size([100, 3, 1280]) + + +class TestAscendQwen2VisionBlock(PytestBase): + + def init_vision_block( + self, + mocker, + dim=100, + num_heads=10, + mlp_ratio=0.5, + ): + mocker_vit = mocker.patch( + "vllm.model_executor.models.qwen2_vl.Qwen2VisionBlock.__init__", + return_value=None, + ) + + mocker_attn = mocker.patch( + "vllm_ascend.models.qwen2_vl.AscendQwen2VisionAttention.__init__", + return_value=None, + ) + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + vision_block = AscendQwen2VisionBlock( + dim=dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + ) + args, 
kwargs = mocker_vit.call_args + assert args == (dim, num_heads, mlp_ratio, QuickGELU, None, None, "") + assert not kwargs + + args1, kwargs1 = mocker_attn.call_args + assert not args1 + assert kwargs1 == { + "embed_dim": dim, + "num_heads": num_heads, + "projection_size": dim, + "quant_config": None, + "prefix": ".attn", + } + return vision_block + + def test_init_vision_block_should_normal( + self, + mocker: MockerFixture, + ): + vision_block = self.init_vision_block(mocker) + assert isinstance(vision_block, AscendQwen2VisionBlock) + + def test_vision_block_forward(self, mocker: MockerFixture): + x = torch.randint(1, 100, (100, 3, 1280)) # s,b,d + cu_seqlens = torch.tensor([10, 50, 100]) + cos = torch.rand((1, 100, 1, 128)) + sin = torch.rand((1, 100, 1, 128)) + vision_block = self.init_vision_block(mocker) + mocker_attn = mocker.patch.object(vision_block, "attn", return_value=x) + mocker_mlp = mocker.patch.object(vision_block, "mlp", return_value=x) + vision_block.__dict__["attn"] = mocker_attn + vision_block.__dict__["mlp"] = mocker_mlp + + output = vision_block.forward(x.clone(), cu_seqlens, cos, sin) + + _, attn_kwargs = mocker_attn.call_args + assert attn_kwargs == { + "cu_seqlens": cu_seqlens, + "cos": cos, + "sin": sin, + } + + assert torch.all(x * 3 == output) diff --git a/tests/ut/models/test_qwen3_moe.py b/tests/ut/models/test_qwen3_moe.py new file mode 100644 index 0000000..e882fe2 --- /dev/null +++ b/tests/ut/models/test_qwen3_moe.py @@ -0,0 +1,98 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +import math +import unittest + +import pytest +import torch +from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM + +from vllm_ascend.models.qwen3_moe import CustomQwen3MoeForCausalLM +from vllm_ascend.torchair.models.qwen3_moe import CustomQwen3MoeAttention + + +class TestCustomQwen3MoeForCausalLM: + + def test_class_inheritance(self): + assert issubclass(CustomQwen3MoeForCausalLM, Qwen3MoeForCausalLM) + + @pytest.mark.parametrize("key, expected", [ + ("qkv_proj", ["q_proj", "k_proj", "v_proj"]), + ("gate_up_proj", ["gate_proj", "up_proj"]), + ("experts", + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]), + ]) + def test_packed_modules_mapping(self, key, expected): + assert CustomQwen3MoeForCausalLM.packed_modules_mapping[ + key] == expected + + def test_packed_modules_mapping_structure(self): + expected_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": [ + "experts.0.gate_proj", "experts.0.up_proj", + "experts.0.down_proj" + ] + } + assert CustomQwen3MoeForCausalLM.packed_modules_mapping == expected_mapping + + +class DummyRMSNorm: + + def __init__(self, dim: int, eps: float = 1e-6): + self.dim = dim + self.eps = eps + + def __call__(self, x): + mean_sq = x.pow(2).mean(dim=-1, keepdim=True) + denom = (mean_sq + self.eps).sqrt() + return x / denom + + +class TestCustomQwen3MoeAttention(unittest.TestCase): + + def setUp(self): + self.batch = 2 + self.seq_len = 3 + self.q_size = 8 + self.kv_size = 8 + self.head_dim = 4 + self.rms_eps = 1e-6 + + total_dim = self.q_size + 2 * self.kv_size + + self.qkv = torch.arange(self.batch * self.seq_len * total_dim, + dtype=torch.float32).reshape( + self.batch, self.seq_len, total_dim) + + def test_constant_input_normalization(self): + ones_qkv = torch.ones((1, 1, 
self.q_size + 2 * self.kv_size), + dtype=torch.float32) + + q_norm = DummyRMSNorm(self.head_dim, self.rms_eps) + k_norm = DummyRMSNorm(self.head_dim, self.rms_eps) + q, k, v = CustomQwen3MoeAttention.normalize_qkv( + ones_qkv, self.q_size, self.kv_size, self.head_dim, q_norm, k_norm) + + norm_val = 1.0 / math.sqrt(1.0 + self.rms_eps) + + expected_q = torch.full((1, 1, self.q_size), norm_val) + expected_k = torch.full((1, 1, self.kv_size), norm_val) + expected_v = torch.ones((1, 1, self.kv_size), dtype=torch.float32) + + self.assertTrue(torch.allclose(q, expected_q, atol=1e-6)) + self.assertTrue(torch.allclose(k, expected_k, atol=1e-6)) + self.assertTrue(torch.equal(v, expected_v)) diff --git a/tests/ut/multistream/test_base.py b/tests/ut/multistream/test_base.py new file mode 100644 index 0000000..4bdd29b --- /dev/null +++ b/tests/ut/multistream/test_base.py @@ -0,0 +1,32 @@ +from tests.ut.base import TestBase +from vllm_ascend.multistream.base import (MSAttentionMetadataSplitConfig, + MSEventKey) + + +class Testbase(TestBase): + + def test_ms_event_key(self): + self.assertEqual(MSEventKey.ATTN_COM_FINISH.value, 0) + self.assertEqual(MSEventKey.ATTN_AR_FINISH.value, 1) + self.assertEqual(MSEventKey.FFN_COM_FINISH.value, 2) + self.assertEqual(MSEventKey.FFN_AR_FINISH.value, 3) + self.assertEqual(MSEventKey.MOE_BEFORE_COMM.value, 4) + self.assertEqual(MSEventKey.MOE_AFTER_COMM.value, 5) + self.assertEqual(MSEventKey.MOE_SE_COMM_FINISH.value, 6) + self.assertEqual(MSEventKey.MOE_SE_COMP_FINISH.value, 7) + self.assertEqual(MSEventKey.MOE_GATE_FINISH.value, 8) + + def test_ms_attention_metadata_split_config_default(self): + config = MSAttentionMetadataSplitConfig() + self.assertEqual(config.num_micro_batches, 2) + self.assertEqual(config.min_total_tokens_to_split, 256) + self.assertEqual(config.min_prefill_tokens_to_split, 64) + + def test_ms_attention_metadata_split_config_custom(self): + config = MSAttentionMetadataSplitConfig( + num_micro_batches=4, + 
min_total_tokens_to_split=512, + min_prefill_tokens_to_split=128) + self.assertEqual(config.num_micro_batches, 4) + self.assertEqual(config.min_total_tokens_to_split, 512) + self.assertEqual(config.min_prefill_tokens_to_split, 128) diff --git a/tests/ut/multistream/test_decorator.py b/tests/ut/multistream/test_decorator.py new file mode 100644 index 0000000..bd3da94 --- /dev/null +++ b/tests/ut/multistream/test_decorator.py @@ -0,0 +1,47 @@ +import pytest +from pytest_mock import MockFixture + +from tests.ut.base import PytestBase +from vllm_ascend.multistream.decorator import set_multistream_support + + +class Context: + + def __init__(self, attn_metadata=None): + self.attn_metadata = attn_metadata + + +class TestDecorator(PytestBase): + + @pytest.mark.parametrize( + 'layer_context, microbatch_context, expected_metadata', [ + ((-1, None, None), -1, { + "original": True + }), + ((-1, None, None), 0, { + "original": True + }), + ((0, None, None), -1, { + "original": True + }), + ((0, None, [{ + "new": True + }]), 0, { + "new": True + }), + ]) + def test_decorator(self, mocker: MockFixture, layer_context, + microbatch_context, expected_metadata): + + def context_func(): + return Context(attn_metadata={"original": True}) + + mocker.patch( + 'vllm_ascend.multistream.decorator.get_multistream_layer_context', + return_value=layer_context) + mocker.patch( + 'vllm_ascend.multistream.decorator.get_multistream_microbatch_context', + return_value=microbatch_context) + + context = set_multistream_support()(context_func)() + assert context.attn_metadata == expected_metadata diff --git a/tests/ut/multistream/test_layers.py b/tests/ut/multistream/test_layers.py new file mode 100644 index 0000000..cf34c6a --- /dev/null +++ b/tests/ut/multistream/test_layers.py @@ -0,0 +1,198 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from tests.ut.base import PytestBase +from vllm_ascend.multistream.base import MSEventKey +from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, + MultiStreamPreTransformerLayer) +from vllm_ascend.multistream.metadata import MultiStreamMetadata + + +# === fixture: mock tensor input === +@pytest.fixture +def input_tensors(): + return [torch.randn(2, 128), torch.randn(2, 128)] + + +# === mock get_forward_context === +class DummyContext: + + def __init__(self, attn_metadata): + self.attn_metadata = attn_metadata + + +class TestMultiStreamPreTransformerLayer(PytestBase): + + # === test when multistream_metadata is None === + @patch("vllm_ascend.multistream.layers.get_forward_context") + @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") + def test_forward_no_multistream_metadata(self, mock_set_ctx, mock_get_ctx, + input_tensors): + mock_get_ctx.return_value = DummyContext(attn_metadata="dummy_meta") + layer = MultiStreamPreTransformerLayer(multistream_metadata=None) + attn_out, input_out = layer.forward(input_tensors) + + assert attn_out == "dummy_meta" + assert input_out == input_tensors + mock_set_ctx.assert_called_once_with(-1, None, None) + + # === test when attn_metadata is None === + @patch("vllm_ascend.multistream.layers.get_forward_context") + @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") + def test_forward_no_attn_metadata(self, mock_set_ctx, 
mock_get_ctx, + input_tensors): + mock_get_ctx.return_value = DummyContext(attn_metadata=None) + dummy_metadata = MagicMock(spec=MultiStreamMetadata) + layer = MultiStreamPreTransformerLayer( + multistream_metadata=dummy_metadata) + + attn_out, input_out = layer.forward(input_tensors) + + assert attn_out is None + assert input_out == input_tensors + mock_set_ctx.assert_called_once_with(-1, None, None) + + # === test when do_ms=False (no split needed) === + @patch("vllm_ascend.multistream.layers.get_forward_context") + @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") + def test_forward_no_split(self, mock_set_ctx, mock_get_ctx, input_tensors): + dummy_attn = "original_attn" + mock_get_ctx.return_value = DummyContext(attn_metadata=dummy_attn) + + dummy_metadata = MagicMock(spec=MultiStreamMetadata) + dummy_metadata.split_micro_batch.return_value = (False, "same_attn", + input_tensors, None) + + layer = MultiStreamPreTransformerLayer( + multistream_metadata=dummy_metadata) + + attn_out, input_out = layer.forward(input_tensors) + + assert attn_out == "same_attn" + assert input_out == input_tensors + mock_set_ctx.assert_called_once_with(-1, None, None) + + # === test when do_ms=True (split occurred) === + @patch("vllm_ascend.multistream.layers.get_forward_context") + @patch("vllm_ascend.multistream.layers.set_multistream_layer_context") + def test_forward_split(self, mock_set_ctx, mock_get_ctx, input_tensors): + dummy_attn = "original_attn" + mock_get_ctx.return_value = DummyContext(attn_metadata=dummy_attn) + + split_inputs = [[t[:1], t[1:]] for t in input_tensors] + + dummy_metadata = MagicMock(spec=MultiStreamMetadata) + dummy_metadata.start_layer = 2 + dummy_metadata.split_micro_batch.return_value = (True, + ["attn1", "attn2"], + split_inputs, None) + + layer = MultiStreamPreTransformerLayer( + multistream_metadata=dummy_metadata) + + attn_out, input_out = layer.forward(input_tensors) + + assert attn_out == ["attn1", "attn2"] + assert input_out 
== split_inputs + mock_set_ctx.assert_called_once_with(2, dummy_metadata, + ["attn1", "attn2"]) + + +class TestMultiStreamPostTransformerLayer(PytestBase): + + def test_post_forward_metadata_none(self, input_tensors): + layer = MultiStreamPostTransformerLayer(multistream_metadata=None) + output = layer.forward(input_tensors) + assert output == input_tensors + + dummy_metadata = MagicMock(spec=MultiStreamMetadata) + dummy_metadata.ms_config = None + layer = MultiStreamPostTransformerLayer( + multistream_metadata=dummy_metadata) + output = layer.forward(input_tensors) + assert output == input_tensors + + @patch("vllm_ascend.multistream.layers.get_multistream_layer_context") + @patch("vllm_ascend.multistream.layers.reset_multistream_layer_context") + def test_post_forward_normal_flow(self, mock_reset_ctx, mock_get_ctx, + input_tensors): + A_instance_of_MultiStreamMetadata = MultiStreamMetadata( + calculate_stream=MagicMock(), + communicate_stream=MagicMock(), + start_layer=0, + end_layer=1, + event_keys=[], + multistream_config=None, + ) + dummy_metadata = MagicMock(spec=A_instance_of_MultiStreamMetadata) + dummy_metadata.ms_config.num_micro_batches = 4 + dummy_metadata.end_layer = 10 + + mock_get_ctx.return_value = ( + 5, # layer_index + dummy_metadata, # ms_metadata + "dummy_attn_metadata" # ms_attn_metadata + ) + + dummy_metadata.merge_micro_batches.return_value = "merged_result" + + layer = MultiStreamPostTransformerLayer( + multistream_metadata=dummy_metadata) + output = layer.forward(input_tensors) + + # check wait_event + dummy_metadata.try_wait_event.assert_called_once_with( + 9, # end_layer - 1 + 3, # num_micro_batches - 1 + MSEventKey.FFN_AR_FINISH) + mock_reset_ctx.assert_called_once() + assert output == "merged_result" + + @patch("vllm_ascend.multistream.layers.get_multistream_layer_context") + @patch("vllm_ascend.multistream.layers.reset_multistream_layer_context") + def test_post_forward_with_custom_wait_layer(self, mock_reset_ctx, + mock_get_ctx, 
input_tensors): + A_instance_of_MultiStreamMetadata = MultiStreamMetadata( + calculate_stream=MagicMock(), + communicate_stream=MagicMock(), + start_layer=0, + end_layer=1, + event_keys=[], + multistream_config=None, + ) + dummy_metadata = MagicMock(spec=A_instance_of_MultiStreamMetadata) + dummy_metadata.ms_config.num_micro_batches = 4 + dummy_metadata.end_layer = 10 + + mock_get_ctx.return_value = ( + 3, # layer_index + dummy_metadata, + "dummy_attn_metadata") + + dummy_metadata.merge_micro_batches.return_value = "merged_result" + + layer = MultiStreamPostTransformerLayer( + multistream_metadata=dummy_metadata) + output = layer.forward(input_tensors, wait_layer_index=7) + + dummy_metadata.try_wait_event.assert_called_once_with( + 7, 3, MSEventKey.FFN_AR_FINISH) + mock_reset_ctx.assert_called_once() + assert output == "merged_result" diff --git a/tests/ut/multistream/test_metadata.py b/tests/ut/multistream/test_metadata.py new file mode 100644 index 0000000..79fd703 --- /dev/null +++ b/tests/ut/multistream/test_metadata.py @@ -0,0 +1,246 @@ +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.multistream.base import MSEventKey +from vllm_ascend.multistream.metadata import (MultiStreamConfig, + MultiStreamMetadata, + MultiStreamStepMetadata, + split_micro_batches_tensors) + + +class TestMetaData(TestBase): + + def setUp(self): + self.test_tensors_list = [torch.randn(100, 1024) for i in range(3)] + self.test_tensors = torch.randn(100, 1024) + self.test_tensors_dict = { + 'query': torch.randn(100, 1024), + 'key': torch.randn(100, 1024), + 'value': torch.randn(100, 1024) + } + self.split_index = 50 + + mock_stream = MagicMock(spec=torch.npu.Stream) + event_keys = [MagicMock(spec=MSEventKey)] + multistream_config = MagicMock(spec=MultiStreamConfig) + + self.metadata = MultiStreamMetadata( + calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + 
event_keys=event_keys, + multistream_config=multistream_config) + + def test_split_micro_batches_tensors(self): + test_tensors_list_res = split_micro_batches_tensors( + self.test_tensors_list, self.split_index) + test_tensors_res = split_micro_batches_tensors(self.test_tensors, + self.split_index) + keys = ['query', 'key', 'value'] + test_tensors_dict_res = split_micro_batches_tensors( + self.test_tensors_dict, self.split_index, keys) + for i in range(3): + self.assertEqual(len(test_tensors_list_res[i][0]), + self.split_index) + + self.assertEqual( + len(test_tensors_list_res[i][0]) + + len(test_tensors_list_res[i][1]), 100) + + self.assertEqual(len(test_tensors_res[0]), self.split_index) + self.assertEqual( + len(test_tensors_res[0]) + len(test_tensors_res[1]), 100) + + for key in keys: + self.assertEqual(len(test_tensors_dict_res[0][key]), + self.split_index) + self.assertEqual( + len(test_tensors_dict_res[0][key]) + + len(test_tensors_dict_res[1][key]), 100) + + def test_default_init_multistream_step_metadata(self): + metadata = MultiStreamStepMetadata() + self.assertIsNone(metadata.comm_stream) + self.assertIsNone(metadata.before_comm_event) + self.assertIsNone(metadata.after_comm_event) + + def test_custom_init_multistream_step_metadata(self): + mockStream = MagicMock(spec=torch.npu.Stream) + mockEvent1 = MagicMock(spec=torch.npu.Event) + mockEvent2 = MagicMock(spec=torch.npu.Event) + + metadata = MultiStreamStepMetadata(mockStream, mockEvent1, mockEvent2) + self.assertEqual(metadata.comm_stream, mockStream) + self.assertEqual(metadata.before_comm_event, mockEvent1) + self.assertEqual(metadata.after_comm_event, mockEvent2) + + def test_default_init_multistream_config(self): + config = MultiStreamConfig() + self.assertEqual(config.min_total_tokens_to_split, 256) + self.assertEqual(config.min_prefill_tokens_to_split, 64) + self.assertEqual(config.num_micro_batches, 2) + self.assertEqual(config.imbalance_ratio, 0.1) + + def 
test_custom_init_multistream_config(self): + config = MultiStreamConfig(512, 128, 1, 0.2) + self.assertEqual(config.min_total_tokens_to_split, 512) + self.assertEqual(config.min_prefill_tokens_to_split, 128) + self.assertEqual(config.num_micro_batches, 1) + self.assertEqual(config.imbalance_ratio, 0.2) + + def test_init_multistream_metadata(self): + mock_stream = MagicMock(spec=torch.npu.Stream) + + event_keys = [MagicMock()] + multistream_config = MagicMock(spec=MultiStreamConfig) + + metadata = MultiStreamMetadata(calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + event_keys=event_keys, + multistream_config=multistream_config) + + self.assertEqual(metadata.calculate_stream, mock_stream) + self.assertEqual(metadata.communicate_stream, mock_stream) + self.assertEqual(metadata.start_layer, 1) + self.assertEqual(metadata.end_layer, 3) + self.assertEqual(metadata.ms_config, multistream_config) + self.assertTrue(metadata.causal_lm) + + def test_build_events(self): + mock_stream = MagicMock(spec=torch.npu.Stream) + mock_event = MagicMock(spec=torch.npu.Event) + with patch('torch.npu.Event', return_value=mock_event): + event_keys = [MagicMock(spec=MSEventKey)] + multistream_config = MultiStreamConfig( + num_micro_batches=2, + min_total_tokens_to_split=256, + min_prefill_tokens_to_split=64) + + metadata = MultiStreamMetadata( + calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + event_keys=event_keys, + multistream_config=multistream_config) + + expected_events = { + 0: { + 0: { + event_keys[0]: mock_event + }, + 1: { + event_keys[0]: mock_event + } + }, + 1: { + 0: { + event_keys[0]: mock_event + }, + 1: { + event_keys[0]: mock_event + } + }, + 2: { + 0: { + event_keys[0]: mock_event + }, + 1: { + event_keys[0]: mock_event + } + } + } + self.assertEqual(metadata.ms_events, expected_events) + + def test_build_ms_split_config(self): + mock_stream = MagicMock(spec=torch.npu.Stream) 
+ event_keys = [MagicMock(spec=MSEventKey)] + multistream_config = MagicMock(spec=MultiStreamConfig) + multistream_config.num_micro_batches = 2 + multistream_config.min_total_tokens_to_split = 256 + multistream_config.min_prefill_tokens_to_split = 64 + + metadata = MultiStreamMetadata(calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + event_keys=event_keys, + multistream_config=multistream_config) + + self.assertIsNotNone(metadata.ms_split_config) + self.assertEqual(metadata.ms_split_config.num_micro_batches, + multistream_config.num_micro_batches) + self.assertEqual(metadata.ms_split_config.min_total_tokens_to_split, + multistream_config.min_total_tokens_to_split) + self.assertEqual(metadata.ms_split_config.min_prefill_tokens_to_split, + multistream_config.min_prefill_tokens_to_split) + + def test_try_wait_event(self): + mock_stream = MagicMock(spec=torch.npu.Stream) + mock_event = MagicMock(spec=torch.npu.Event) + event_keys = [MagicMock(spec=MSEventKey)] + multistream_config = MagicMock(spec=MultiStreamConfig) + with patch('torch.npu.Event', return_value=mock_event): + metadata = MultiStreamMetadata( + calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + event_keys=event_keys, + multistream_config=multistream_config) + + metadata.try_wait_event(layer_index=1, + micro_batch_index=0, + event_key=event_keys[0]) + mock_event.wait.assert_called_once() + + def test_try_record_event(self): + mock_stream = MagicMock(spec=torch.npu.Stream) + mock_event = MagicMock(spec=torch.npu.Event) + event_keys = [MagicMock(spec=MSEventKey)] + multistream_config = MagicMock(spec=MultiStreamConfig) + with patch('torch.npu.Event', return_value=mock_event): + metadata = MultiStreamMetadata( + calculate_stream=mock_stream, + communicate_stream=mock_stream, + start_layer=1, + end_layer=3, + event_keys=event_keys, + multistream_config=multistream_config) + + metadata.try_record_event(layer_index=1, 
+ micro_batch_index=0, + event_key=event_keys[0]) + mock_event.record.assert_called_once() + + def test_merge_batches_none_input(self): + input_tensors = None + result = self.metadata.merge_micro_batches(input_tensors) + self.assertIsNone(result) + + def test_merge_batches_single_tensor_input(self): + input_tensors = [torch.tensor([1, 2, 3])] + result = self.metadata.merge_micro_batches(input_tensors) + self.assertEqual(len(result), 1) + self.assertTrue(torch.equal(result[0], torch.tensor([1, 2, 3]))) + + def test_merge_batches_list_of_tensors_input(self): + input_tensors = [torch.tensor([1, 2]), torch.tensor([3, 4])] + result = self.metadata.merge_micro_batches(input_tensors) + self.assertEqual(len(result), 2) + self.assertEqual(result, input_tensors) + + def test_merge_batches_nested_list_input(self): + input_tensors = [[torch.tensor([1, 2]), + torch.tensor([3, 4])], + [torch.tensor([5, 6]), + torch.tensor([7, 8])]] + result = self.metadata.merge_micro_batches(input_tensors) + self.assertEqual(len(result), 2) + self.assertTrue(torch.equal(result[0], torch.tensor([1, 2, 3, 4]))) + self.assertTrue(torch.equal(result[1], torch.tensor([5, 6, 7, 8]))) diff --git a/tests/ut/multistream/test_ms_split.py b/tests/ut/multistream/test_ms_split.py new file mode 100644 index 0000000..e76321a --- /dev/null +++ b/tests/ut/multistream/test_ms_split.py @@ -0,0 +1,147 @@ +from unittest.mock import MagicMock + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig +from vllm_ascend.multistream.ms_split import (compute_split_seq_index, + model_input_split_v1_mla_attn, + split_attn_int_type, + split_attn_tensor_type) + + +class TestMsSplit(TestBase): + + def test_decode_only(self): + result = compute_split_seq_index( + query_lens=None, + attn_state=AscendAttentionState.DecodeOnly, + num_tokens=10) + self.assertEqual(result, [5, 5]) + + def 
test_perfect_balance(self): + query_lens = [2, 3, 5] + result = compute_split_seq_index( + query_lens=query_lens, + attn_state=AscendAttentionState.PrefillNoCache, + num_tokens=10) + self.assertEqual(result, [5, 2]) + + def test_imbalance(self): + query_lens = [1, 2, 3, 4] + result = compute_split_seq_index( + query_lens=query_lens, + attn_state=AscendAttentionState.PrefillNoCache, + num_tokens=10) + self.assertEqual(result, [0, 0]) + + def test_query_lens_none(self): + with self.assertRaises(AssertionError): + compute_split_seq_index( + query_lens=None, + attn_state=AscendAttentionState.PrefillNoCache, + num_tokens=10) + + def test_empty_query_lens(self): + query_lens: list[int] = [] + result = compute_split_seq_index( + query_lens=query_lens, + attn_state=AscendAttentionState.PrefillNoCache, + num_tokens=10) + self.assertEqual(result, [0, 0]) + + def test_single_query_len(self): + query_lens = [10] + result = compute_split_seq_index( + query_lens=query_lens, + attn_state=AscendAttentionState.PrefillNoCache, + num_tokens=10) + self.assertEqual(result, [0, 0]) + + def test_split_attn_tensor_type_middle(self): + input_tensor = torch.tensor([1, 2, 3, 4, 5]) + index = 3 + expected_result = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])] + result = split_attn_tensor_type(input_tensor, index) + self.assertEqual(len(result), 2) + self.assertTrue(torch.equal(result[0], expected_result[0])) + self.assertTrue(torch.equal(result[1], expected_result[1])) + + def test_split_attn_tensor_type_start(self): + input_tensor = torch.tensor([1, 2, 3, 4, 5]) + index = 0 + expected_result = [torch.tensor([]), torch.tensor([1, 2, 3, 4, 5])] + result = split_attn_tensor_type(input_tensor, index) + self.assertEqual(len(result), 2) + self.assertTrue(torch.equal(result[0], expected_result[0])) + self.assertTrue(torch.equal(result[1], expected_result[1])) + + def test_split_attn_tensor_type_end(self): + input_tensor = torch.tensor([1, 2, 3, 4, 5]) + index = 5 + expected_result = 
[torch.tensor([1, 2, 3, 4, 5]), torch.tensor([])] + result = split_attn_tensor_type(input_tensor, index) + self.assertEqual(len(result), 2) + self.assertTrue(torch.equal(result[0], expected_result[0])) + self.assertTrue(torch.equal(result[1], expected_result[1])) + + def test_split_attn_tensor_type_empty_tensor(self): + input_tensor = torch.tensor([]) + index = 0 + expected_result = [torch.tensor([]), torch.tensor([])] + result = split_attn_tensor_type(input_tensor, index) + self.assertEqual(len(result), 2) + self.assertTrue(torch.equal(result[0], expected_result[0])) + self.assertTrue(torch.equal(result[1], expected_result[1])) + + def test_split_attn_int_type_index_greater_than_var(self): + var = 5 + index = 10 + expected_result = [5, 0] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_attn_int_type_index_equal_to_var(self): + var = 5 + index = 5 + expected_result = [5, 0] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_attn_int_type_index_less_than_var(self): + var = 10 + index = 5 + expected_result = [5, 5] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_attn_int_type_index_zero(self): + var = 10 + index = 0 + expected_result = [0, 10] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_attn_int_type_var_zero(self): + var = 0 + index = 5 + expected_result = [0, 0] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_attn_int_type_both_zero(self): + var = 0 + index = 0 + expected_result = [0, 0] + result = split_attn_int_type(var, index) + self.assertEqual(result, expected_result) + + def test_split_v1_mla_attn_input_none(self): + attn_metadata = None + ascendMLAPrefillMetadata = MagicMock() + ms_split_config = MSAttentionMetadataSplitConfig(num_micro_batches=1) + result = 
model_input_split_v1_mla_attn(attn_metadata, + ascendMLAPrefillMetadata, + ms_split_config) + self.assertEqual(result, [None]) diff --git a/tests/ut/ops/expert_map.json b/tests/ut/ops/expert_map.json new file mode 100644 index 0000000..bb74799 --- /dev/null +++ b/tests/ut/ops/expert_map.json @@ -0,0 +1,17 @@ +{ + "moe_layer_count": + 1, + "layer_list": [{ + "layer_id": + 0, + "device_count": + 2, + "device_list": [{ + "device_id": 0, + "device_expert": [7, 2, 0, 3, 5] + }, { + "device_id": 1, + "device_expert": [6, 1, 4, 7, 2] + }] + }] +} diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py new file mode 100644 index 0000000..b90ccff --- /dev/null +++ b/tests/ut/ops/test_activation.py @@ -0,0 +1,61 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
#

from unittest.mock import patch

import pytest
import torch
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul


@pytest.fixture
def dummy_tensor():
    # Small fp16 activation input shared by the tests below.
    return torch.randn(4, 8, dtype=torch.float16)


@patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1)
def test_QuickGELU_forward(mock_gelu, dummy_tensor):
    """QuickGELU.forward must delegate to torch_npu.npu_fast_gelu exactly once.

    npu_fast_gelu is stubbed with ``x + 1`` so the result can be checked
    without real NPU kernels.
    """
    layer = QuickGELU()
    out = layer.forward(dummy_tensor)

    # With the stub in place, forward should return its input plus one.
    expected_out = dummy_tensor + 1
    assert torch.allclose(out, expected_out)

    mock_gelu.assert_called_once()


@pytest.mark.parametrize("is_310p_return", [True, False])
@patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1)
def test_SiluAndMul_forward(mock_swiglu, is_310p_return, dummy_tensor):
    """SiluAndMul.forward must call torch_npu.npu_swiglu exactly once; when
    running on 310P the kernel input is expected to be upcast to float32.

    NOTE(review): this patches ``is_310p`` at its defining module
    (``vllm_ascend.utils``). If the SiluAndMul implementation binds it via
    ``from vllm_ascend.utils import is_310p``, the patch would not take
    effect at the lookup site — confirm the patch target.
    """
    with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
        layer = SiluAndMul()
        out = layer.forward(dummy_tensor)

    # On 310P the kernel is presumably fed a float32 copy of the input;
    # otherwise the original fp16 tensor should be passed through unchanged.
    if is_310p_return:
        expected_arg = dummy_tensor.to(torch.float32)
    else:
        expected_arg = dummy_tensor

    mock_swiglu.assert_called_once()

    # First positional argument of the (single) npu_swiglu call.
    actual_arg = mock_swiglu.call_args[0][0]
    assert torch.allclose(
        actual_arg,
        expected_arg), "npu_swiglu called with unexpected input"

    # The stub returns input + 1, so the layer output must match that.
    expected_out = dummy_tensor + 1
    assert torch.allclose(out, expected_out)
diff --git a/tests/ut/ops/test_common_fused_moe.py b/tests/ut/ops/test_common_fused_moe.py
new file mode 100644
index 0000000..409a301
--- /dev/null
+++ b/tests/ut/ops/test_common_fused_moe.py
@@ -0,0 +1,69 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from unittest.mock import patch

import torch

from tests.ut.base import TestBase
from vllm_ascend.ops.common_fused_moe import fused_experts_moge


class TestFusedExpertsMoGE(TestBase):
    """Shape-level unit test for fused_experts_moge with NPU kernels stubbed.

    NOTE(review): ``is_310p`` is patched at its defining module
    (``vllm_ascend.utils``); if common_fused_moe imports the name directly,
    the patch would not take effect there — confirm the patch target.
    """

    def test_fused_experts_moge(self):
        # Stub the Ascend kernels so the test runs on plain CPU:
        # - npu_grouped_matmul returns one random tensor whose shape matches
        #   what the real grouped matmul would produce for the first
        #   input/weight pair
        # - npu_swiglu acts as the identity
        # - is_310p reports a non-310P SoC
        with patch('torch_npu.npu_grouped_matmul') as mock_grouped_matmul, \
                patch('torch_npu.npu_swiglu') as mock_swiglu, \
                patch('vllm_ascend.utils.is_310p') as mock_is_310p:

            mock_is_310p.return_value = False

            mock_grouped_matmul.side_effect = lambda x, weight, **kwargs: [
                torch.randn(x[0].shape[0], weight[0].shape[1])
            ]

            mock_swiglu.side_effect = lambda x: x

            # 4 tokens, hidden size 128, 4 experts, top-1 routing; each
            # token is routed to a distinct expert.
            hidden_states = torch.randn(4, 128)
            w1 = torch.randn(4, 256, 128)
            w2 = torch.randn(4, 128, 128)
            topk_weights = torch.rand(4, 1)
            topk_ids = torch.tensor([[0], [1], [2], [3]], dtype=torch.long)
            top_k = 1
            global_num_experts = 4

            # Minimal stand-in for the MoE parallel config: single-rank
            # EP/TP/DP with expert parallelism enabled.
            moe_parallel_config = type(
                'MockConfig', (), {
                    'ep_size': 1,
                    'tp_size': 1,
                    'dp_size': 1,
                    'tp_rank': 0,
                    'dp_rank': 0,
                    'ep_rank': 0,
                    'use_ep': True
                })()

            output = fused_experts_moge(
                hidden_states=hidden_states,
                w1=w1,
                w2=w2,
                moe_parallel_config=moe_parallel_config,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                top_k=top_k,
                global_num_experts=global_num_experts,
                apply_router_weight_on_input=True,
            )

            # The fused path must preserve the (num_tokens, hidden_size)
            # shape of its input.
            self.assertEqual(output.shape, (4, 128))
diff --git a/tests/ut/ops/test_expert_load_balancer.py b/tests/ut/ops/test_expert_load_balancer.py
new file mode 100644
index 0000000..97beada
--- /dev/null
+++ b/tests/ut/ops/test_expert_load_balancer.py
@@ -0,0 +1,141 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import json
import os
from typing import List, TypedDict
from unittest import mock

import torch

from tests.ut.base import TestBase
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer


class Device(TypedDict):
    # One device entry of the placement JSON: the physical expert ids it
    # hosts, in slot order.
    device_id: int
    device_expert: List[int]


class Layer(TypedDict):
    # One MoE layer entry of the placement JSON.
    layer_id: int
    device_count: int
    device_list: List[Device]


class MockData(TypedDict):
    # Top-level schema of expert_map.json.
    moe_layer_count: int
    layer_list: List[Layer]


class TestExpertLoadBalancer(TestBase):
    """Tests ExpertLoadBalancer against the expert_map.json fixture.

    The fixture describes one MoE layer spread over two devices:
    device 0 hosts experts [7, 2, 0, 3, 5] and device 1 hosts
    [6, 1, 4, 7, 2] — 10 physical slots for 8 logical experts, i.e. two
    redundant expert copies.
    """

    def setUp(self):
        # Load the placement JSON that sits next to this test module.
        _TEST_DIR = os.path.dirname(__file__)
        json_file = _TEST_DIR + "/expert_map.json"
        with open(json_file, 'r') as f:
            self.expert_map: MockData = json.load(f)

        self.expert_load_balancer = ExpertLoadBalancer(json_file,
                                                       global_expert_num=8)

    def test_init(self):
        # Layer/rank counts must mirror what the JSON declares.
        self.assertIsInstance(self.expert_load_balancer.expert_map_tensor,
                              torch.Tensor)
        self.assertEqual(self.expert_load_balancer.layers_num,
                         self.expert_map["moe_layer_count"])
        self.assertEqual(self.expert_load_balancer.ranks_num,
                         self.expert_map["layer_list"][0]["device_count"])

    def test_generate_index_dicts(self):
        # Each per-rank dict maps an expert id to its global physical slot;
        # rank 1's slots continue counting after rank 0's five slots.
        tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]])
        result = self.expert_load_balancer.generate_index_dicts(tensor_2d)
        expected_result = [{
            7: 0,
            2: 1,
            0: 2,
            3: 3,
            5: 4
        }, {
            6: 5,
            1: 6,
            4: 7,
            7: 8,
            2: 9
        }]
        self.assertEqual(result, expected_result)

    def test_generate_expert_placement_map(self):
        # Placement map is (layers, ranks, global experts); -1 marks
        # "expert not hosted on this rank".
        expert_placement_map = self.expert_load_balancer.generate_expert_placement_map(
        )
        self.assertEqual(expert_placement_map.shape,
                         (self.expert_load_balancer.layers_num,
                          self.expert_load_balancer.ranks_num, 8))
        self.assertTrue(torch.all(expert_placement_map >= -1))

    def test_generate_log2phy_expert_map(self):
        # Logical-to-physical map is (ranks, global experts), again with -1
        # as the "absent" sentinel.
        layer_id = 0
        log2phy_map = self.expert_load_balancer.generate_log2phy_expert_map(
            layer_id)
        self.assertEqual(log2phy_map.shape,
                         (self.expert_load_balancer.ranks_num, 8))
        self.assertTrue(torch.all(log2phy_map >= -1))

    # NPU initialization is mocked out so the map can be built on a
    # CPU-only CI host; current_device returning "cpu" stands in for a
    # real device handle — presumably only used as a .to() target.
    @mock.patch("torch_npu.npu._lazy_init")
    @mock.patch("torch.npu.current_device", return_value="cpu")
    def test_get_rank_placement_map(self, mock_current_device, mock_lazy_init):
        layer_id = 0
        rank_id = 0
        rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map(
            layer_id, rank_id)
        # Rank 0 hosts five physical slots; the map gives each logical
        # expert its local slot index (e.g. expert 7 sits in slot 0).
        self.assertEqual(rank_local_expert_num, 5)
        expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0],
                                       dtype=torch.int32).to(
                                           rank_expert_map.device)
        self.assertTrue(rank_expert_map.equal(expected_tensor))

        rank_id = 1
        rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3],
                                       dtype=torch.int32).to(
                                           rank_expert_map.device)
        self.assertTrue(rank_expert_map.equal(expected_tensor))

    def test_get_rank_log2phy_map(self):
        # Per-rank logical-to-global-physical-slot mapping; duplicated
        # experts (2 and 7) resolve to different slots per rank.
        layer_id = 0
        rank_id = 0
        log2phy_map = self.expert_load_balancer.get_rank_log2phy_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0],
                                       dtype=torch.int32).to(
                                           log2phy_map.device)
        self.assertTrue(log2phy_map.equal(expected_tensor))

        rank_id = 1
        log2phy_map = self.expert_load_balancer.get_rank_log2phy_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8],
                                       dtype=torch.int32).to(
                                           log2phy_map.device)
        self.assertTrue(log2phy_map.equal(expected_tensor))

    def test_get_global_redundant_expert_num(self):
        # Redundancy = total physical slots minus the 8 logical experts
        # (5 slots/device * 2 devices - 8 = 2 for this fixture).
        redundant_expert_num = self.expert_load_balancer.get_global_redundant_expert_num(
        )
        expected_redundant_expert_num = len(self.expert_map["layer_list"][0]["device_list"][0]["device_expert"]) * \
            self.expert_map["layer_list"][0]["device_count"] - 8
        self.assertEqual(redundant_expert_num, expected_redundant_expert_num)
diff --git a/tests/ut/ops/test_fused_ops.py b/tests/ut/ops/test_fused_ops.py
new file mode 100644
index 0000000..6a51d1d
--- /dev/null
+++ b/tests/ut/ops/test_fused_ops.py
@@ -0,0 +1,741 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
+# +from typing import List, TypedDict +from unittest.mock import MagicMock, patch + +import pytest +import torch +import torch.nn as nn +import torch_npu +from pytest_mock import MockerFixture +from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase + +import vllm_ascend.ops.moe_dispatcher.token_dispatcher as token_dispatcher_module +from tests.ut.base import TestBase +from vllm_ascend.ascend_forward_context import (FusedMoEState, + _get_fused_moe_state) +from vllm_ascend.ops.fused_moe import (AscendFusedMoE, + AscendUnquantizedFusedMoEMethod) +from vllm_ascend.ops.layers.experts_selector import select_experts +from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp +from vllm_ascend.utils import AscendSocVersion, adapt_patch + +adapt_patch(True) + + +def mock_ep_and_mc2_group(mocker): + mock_group = mocker.MagicMock() + mock_group.rank_in_group = 0 + mock_group.rank = 0 + mock_group.world_size = 4 + mock_group.device_group = "mock_group_ep" + mock_group.all_to_all = MagicMock(return_value=torch.randn(8, 8)) + return mock_group + + +def mock_dp_and_tp_group(mocker): + mock_group = mocker.MagicMock() + mock_group.rank_in_group = 0 + mock_group.world_size = 2 + mock_group.device_group = "mock_group" + mock_group.all_gather = MagicMock(return_value=torch.randn(10, 32)) + return mock_group + + +def mock_npu_format_cast(weight_data, format): + return weight_data + + +@pytest.fixture +def mock_dist_env(mocker: MockerFixture): + mock_setup_token_dispatchers = MagicMock() + mock_token_dispatcher_with_allgather = MagicMock() + mock_token_dispatcher_with_all2allv = MagicMock() + mock_token_dispatcher_with_mc2 = MagicMock() + + mock_dispatch_result_allgather = { + "hidden_states": torch.randn(16, 2), + "group_list": torch.tensor([8, 16], dtype=torch.int64), + "group_list_type": 0, + } + mock_combine_result_allgather = torch.randn(16, 2) + + mock_token_dispatcher_with_allgather.token_dispatch.return_value = mock_dispatch_result_allgather + 
mock_token_dispatcher_with_allgather.token_combine.return_value = mock_combine_result_allgather + + mock_dispatch_result_all2allv = { + "hidden_states": torch.randn(16, 2), + "group_list": torch.tensor([4, 8, 12, 16], dtype=torch.int64), + "group_list_type": 1, + "dynamic_scale": None, + } + mock_combine_result_all2allv = torch.randn(16, 2) + mock_token_dispatcher_with_all2allv.token_dispatch.return_value = mock_dispatch_result_all2allv + mock_token_dispatcher_with_all2allv.token_combine.return_value = mock_combine_result_all2allv + + mock_dispatch_result_mc2 = { + "hidden_states": torch.randn(16, 2), + "group_list": torch.tensor([5, 10, 15, 16], dtype=torch.int64), + "group_list_type": 1, + "dynamic_scale": None, + "assist_info_for_combine": torch.randn(16, 2), + "ep_recv_counts": torch.tensor([4, 4, 4, 4], dtype=torch.int32), + } + mock_combine_result_mc2 = torch.randn(16, 2) + mock_token_dispatcher_with_mc2.token_dispatch.return_value = mock_dispatch_result_mc2 + mock_token_dispatcher_with_mc2.token_combine.return_value = mock_combine_result_mc2 + + captured_dispatchers = {} + + def capture_register(dispatcher_instance): + key = dispatcher_instance.__class__.__name__ + captured_dispatchers[key] = dispatcher_instance + if key == 'TokenDispatcherWithAllGather': + captured_dispatchers[key] = mock_token_dispatcher_with_allgather + elif key == 'TokenDispatcherWithAll2AllV': + captured_dispatchers[key] = mock_token_dispatcher_with_all2allv + elif key == 'TokenDispatcherWithMC2': + captured_dispatchers[key] = mock_token_dispatcher_with_mc2 + + mock_register_token_dispatcher_patcher = patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher', + side_effect=capture_register) + + mock_get_token_dispatcher_patcher = patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_token_dispatcher', + side_effect=lambda name: captured_dispatchers.get(name)) + + default_mock_token_dispatcher = mock_token_dispatcher_with_allgather + + 
mock_forward_context_obj = MagicMock( + fused_moe_state=FusedMoEState.AllGather, + token_dispatcher=default_mock_token_dispatcher, + max_tokens_across_dp=10, + dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10]), + mc2_mask=torch.zeros(16, dtype=torch.bool), + padded_num_tokens=16, + with_quant=False) + + with patch('torch.distributed.get_rank', return_value=0), \ + patch('torch.distributed.get_world_size', return_value=4), \ + patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ + patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \ + patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('torch.distributed.all_gather'), \ + patch('torch.distributed.all_to_all_single'), \ + patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \ + patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \ + patch('vllm.model_executor.layers.fused_moe.config.get_dp_group', + return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm_ascend.ops.fused_moe.get_ascend_config', + return_value=MagicMock( + torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False), + expert_map_path=None + )), \ + patch('vllm_ascend.ops.fused_moe.determine_expert_map', + return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \ + patch('vllm_ascend.ops.fused_moe.get_forward_context', + return_value=mock_forward_context_obj), \ + patch('vllm_ascend.ops.fused_moe.get_current_vllm_config', + return_value=MagicMock( + parallel_config=MagicMock(tensor_parallel_size=2), + scheduler_config=MagicMock(max_num_seqs=4), + 
model_config=MagicMock(max_model_len=2048) + )), \ + patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \ + patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers), \ + patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context', + return_value=mock_forward_context_obj): + + yield { + 'mock_forward_context_obj': mock_forward_context_obj, + 'mock_token_dispatcher_with_allgather': + mock_token_dispatcher_with_allgather, + 'mock_token_dispatcher_with_all2allv': + mock_token_dispatcher_with_all2allv, + 'mock_token_dispatcher_with_mc2': mock_token_dispatcher_with_mc2, + } + + mock_register_token_dispatcher_patcher.stop() + mock_get_token_dispatcher_patcher.stop() + + +@pytest.fixture +def mock_moe_env(mocker: MockerFixture): + + with patch('torch_npu.npu_moe_gating_top_k', return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + None + )), \ + patch('torch_npu.npu_moe_init_routing', return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]) + )), \ + patch("torch_npu.npu_moe_compute_expert_tokens", return_value=( + torch.randn(8, 2) + )), \ + patch("torch_npu.npu_moe_distribute_dispatch", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_moe_distribute_combine", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_grouped_matmul", return_value=( + [torch.randn(16, 2)] + )), \ + patch("torch_npu.npu_swiglu", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_moe_gating_top_k_softmax", return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]) + )), \ + patch("torch_npu.npu_moe_finalize_routing", return_value=( + torch.randn(16, 2) + )): + if hasattr(torch_npu, 'npu_moe_distribute_dispatch_v2'): + with patch("torch_npu.npu_moe_distribute_dispatch_v2", return_value=( + torch.randn(16, 2))), \ + patch("torch_npu.npu_moe_distribute_combine_v2", 
return_value=( + torch.randn(16, 2))): + yield + else: + yield + + +@pytest.fixture +def default_moe_config(): + return { + 'num_experts': 8, + 'top_k': 2, + 'hidden_size': 512, + 'intermediate_size': 1024 + } + + +@pytest.fixture +def moe_method(mock_dist_env): + moe = MagicMock() + moe.moe_parallel_config.return_value = MagicMock(ep_size=4) + return AscendUnquantizedFusedMoEMethod(moe) + + +class Device(TypedDict): + device_id: int + device_expert: List[int] + + +class Layer(TypedDict): + layer_id: int + device_count: int + device_list: List[Device] + + +class MockData(TypedDict): + moe_layer_count: int + layer_list: List[Layer] + + +class MockQuantMethod(nn.Module): + + def __init__(self, shared_experts, num_tokens): + super().__init__() + if shared_experts: + self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32), + torch.randn(num_tokens, 10))) + else: + self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32))) + + +class MockFusedMoEMethod(FusedMoEMethodBase): + moe = MagicMock() + + def __init__(self): + super().__init__(self.moe) + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + pass + + def apply(self, hidden_states: torch.Tensor, + expert_weights: torch.Tensor) -> torch.Tensor: + pass + + +class TestAscendFusedMoe: + + def test_init_no_quant(self, mock_dist_env, default_moe_config): + layer = AscendFusedMoE(**default_moe_config) + + layer.w13_weight = nn.Parameter( + torch.randn(default_moe_config['num_experts'], + default_moe_config['intermediate_size'] * 2, + default_moe_config['hidden_size'])) + layer.w2_weight = nn.Parameter( + torch.randn(default_moe_config['num_experts'], + default_moe_config['hidden_size'], + default_moe_config['intermediate_size'])) + + assert layer.num_experts == default_moe_config['num_experts'] + assert layer.top_k == default_moe_config['top_k'] + assert hasattr(layer, 
'w13_weight') + assert hasattr(layer, 'w2_weight') + + with pytest.raises(AssertionError): + error_config = default_moe_config.copy() + error_config['use_grouped_topk'] = True + layer = AscendFusedMoE(**error_config) + + with pytest.raises(ValueError): + error_config = default_moe_config.copy() + error_config['scoring_func'] = "random" + layer = AscendFusedMoE(**error_config) + + def test_init_with_quant(self, mock_dist_env, default_moe_config): + mock_quant_config = MagicMock() + mock_quant_method = MockFusedMoEMethod() + mock_quant_config.get_quant_method.return_value = mock_quant_method + + moe = AscendFusedMoE(**default_moe_config, + quant_config=mock_quant_config) + + assert moe.quant_method is not None + assert moe.quant_method == mock_quant_method + + @pytest.mark.parametrize( + "others_param", + [[None, + MagicMock(return_value=torch.randn(5, 32)), False, 5, None], + [2, None, False, 5, None], [None, None, True, 5, None], + [None, None, False, 1, None], [None, None, True, 5, 1], + [None, None, False, 5, 1]]) + def test_forward(self, mock_dist_env, default_moe_config, others_param): + + top_k, shared_experts, is_prefill, num_tokens, ep_size = others_param + inputs = torch.randn(num_tokens, 32) + router_logits = torch.randn(num_tokens, 8) + moe = AscendFusedMoE(**default_moe_config) + + if ep_size == 1: + moe.moe_parallel_config.ep_size = 1 + + moe.quant_method = MockQuantMethod(shared_experts, num_tokens) + forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens, + dtype=torch.bool), + padded_num_tokens=num_tokens) + with patch("vllm_ascend.ops.fused_moe.get_forward_context", + return_value=forward_context): + output = moe.forward(inputs, + router_logits, + is_prefill=is_prefill, + top_k=top_k, + shared_experts=shared_experts) + + moe.quant_method.apply.assert_called_once() + + if shared_experts: + assert output[0].shape == (num_tokens, 32) + assert output[1].shape == (num_tokens, 10) + else: + assert output.shape == (num_tokens, 32) + + def 
test_forward_ms_fused_moe_comp(self, mock_dist_env, + default_moe_config): + inputs = torch.randn(5, 32) + router_logits = torch.randn(5, 8) + moe = AscendFusedMoE(**default_moe_config) + + moe.quant_method = MockQuantMethod(None, 5) + output = moe._forward_ms_fused_moe_comp(inputs, + router_logits, + is_prefill=False, + real_top_k=1) + + moe.quant_method.apply.assert_called_once() + + assert output.shape == (5, 32) + + +class TestAscendUnquantizedFusedMoEMethod: + + def test_process_weights_after_loading(self, moe_method, mock_dist_env): + layer = MagicMock() + layer.w13_weight.data = torch.randn(16, 32) + layer.w2_weight.data = torch.randn(16, 32) + + with patch('torch_npu.npu_format_cast', mock_npu_format_cast), \ + patch('vllm_ascend.utils.is_310p', return_value=False): + moe_method.process_weights_after_loading(layer) + + assert isinstance(layer.w13_weight, torch.nn.Parameter) + assert isinstance(layer.w2_weight, torch.nn.Parameter) + assert not layer.w13_weight.requires_grad + assert not layer.w2_weight.requires_grad + + @pytest.mark.parametrize("others_param", + [[256, 4], [128, 1], [128, 1], [128, 4]]) + def test_apply_without_expert_map(self, moe_method, mock_dist_env, + mock_moe_env, others_param): + + global_num_experts, ep_size = others_param + is_prefill = False + is_deepseek_v3_r1 = global_num_experts == 256 + + if ep_size == 1: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_allgather'] + elif ep_size < 16: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_all2allv'] + else: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_mc2'] + + forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + ep_size, is_prefill, is_deepseek_v3_r1), + with_quant=False, + token_dispatcher=selected_token_dispatcher) + + with patch("vllm_ascend.ops.fused_moe.get_forward_context", + return_value=forward_context): + moe_method.ep_size = ep_size + x = torch.randn(8, 2, 2) + router_logits = 
torch.randn(8, 8) + layer = MagicMock() + local_num_experts = 2 + hidden_size = 2 + intermediate_size_per_partition = 4 + + layer.w13_weight = torch.randn(local_num_experts, + intermediate_size_per_partition * 2, + hidden_size) + layer.w2_weight = torch.randn(local_num_experts, hidden_size, + intermediate_size_per_partition) + + result = moe_method.apply(layer=layer, + x=x, + router_logits=router_logits, + top_k=2, + renormalize=True, + global_num_experts=global_num_experts, + is_prefill=is_prefill) + + expected_shape = (16, 2) + + assert result.shape == expected_shape + + @pytest.mark.parametrize("others_param", [16, 1, 4]) + def test_apply_with_expert_map(self, moe_method, mock_dist_env, + mock_moe_env, others_param): + + ep_size = others_param + is_prefill = False + + if ep_size == 1: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_allgather'] + elif ep_size < 16: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_all2allv'] + else: + selected_token_dispatcher = mock_dist_env[ + 'mock_token_dispatcher_with_mc2'] + + forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + ep_size, is_prefill, True), + with_quant=False, + token_dispatcher=selected_token_dispatcher) + + with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \ + patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3): + + expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) + moe_method.ep_size = ep_size + x = torch.randn(8, 2, 2) + if ep_size == 1: + x = x.view(-1, 2) + router_logits = torch.randn(8, 8) + layer = MagicMock() + + local_num_experts = 2 + hidden_size = 2 + intermediate_size_per_partition = 4 + layer.w13_weight = torch.randn(local_num_experts, + intermediate_size_per_partition * 2, + hidden_size) + layer.w2_weight = torch.randn(local_num_experts, hidden_size, + intermediate_size_per_partition) + + result = moe_method.apply(layer=layer, + x=x, + 
router_logits=router_logits, + top_k=2, + renormalize=True, + global_num_experts=128, + expert_map=expert_map, + is_prefill=is_prefill) + + expected_shape = (16, 2) + + assert result.shape == expected_shape + + +class TestExpertsSelector: + + @pytest.mark.parametrize("global_num_experts", [[256], [128]]) + def test_select_experts(self, mock_dist_env, mock_moe_env, + global_num_experts): + + x = torch.randn(8, 2) + router_logits = torch.randn(8, 2) + topk_weights, topk_ids, _ = select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=2, + use_grouped_topk=False, + renormalize=True, + topk_group=None, + num_expert_group=None, + custom_routing_function=None, + scoring_func="softmax", + e_score_correction_bias=None, + global_num_experts=global_num_experts) + + assert topk_weights.shape == (8, 2) + assert topk_ids.shape == (8, 2) + + +class TestUnifiedApplyMLP(TestBase): + + @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context') + @patch('vllm_ascend.ops.layers.moe_mlp.is_310p') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_dynamic_quant') + @patch('torch_npu.npu_dequant_swiglu_quant') + def test_unified_apply_mlp_with_quantization_mc2(self, mock_npu_dequant, + mock_npu_dynamic_quant, + mock_npu_grouped_matmul, + mock_is_310p, + mock_get_forward_context): + + mock_forward_context = MagicMock() + mock_forward_context.fused_moe_state = FusedMoEState.MC2 + mock_get_forward_context.return_value = mock_forward_context + + mock_is_310p.return_value = False + + mock_npu_dynamic_quant.return_value = (torch.randint(-128, + 127, (10, 20), + dtype=torch.int8), + torch.rand(10, + 1, + dtype=torch.float32)) + + mock_npu_grouped_matmul.side_effect = [[ + torch.randint(-2147483648, 2147483647, (10, 40), dtype=torch.int32) + ], [torch.randn(10, 20, dtype=torch.bfloat16)]] + + mock_npu_dequant.return_value = (torch.randn(10, + 40, + dtype=torch.bfloat16), + torch.randn(10, + 1, + dtype=torch.float32)) + + hidden_states = torch.randn(10, 20, 
dtype=torch.bfloat16) + w1 = torch.randint(-128, 127, (5, 20, 40), dtype=torch.int8) + w1_scale = torch.randn(5, 40, dtype=torch.float32) + w2 = torch.randint(-128, 127, (5, 40, 20), dtype=torch.int8) + w2_scale = torch.randn(5, 20, dtype=torch.bfloat16) + group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64) + + result = unified_apply_mlp(hidden_states=hidden_states, + w1=w1, + w1_scale=w1_scale, + w2=w2, + w2_scale=w2_scale, + group_list=group_list, + dynamic_scale=None, + group_list_type=1, + w1_scale_bias=None, + w2_scale_bias=None, + topk_scales=None, + with_quant=True) + + mock_get_forward_context.assert_called() + self.assertEqual(mock_forward_context.fused_moe_state, + FusedMoEState.MC2) + + mock_npu_dynamic_quant.assert_called() + + self.assertEqual(mock_npu_grouped_matmul.call_count, 2) + + mock_npu_dequant.assert_called_once() + + self.assertEqual(result.dtype, torch.bfloat16) + + @patch('vllm_ascend.ops.layers.moe_mlp.is_310p') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_swiglu') + @patch('torch_npu.npu_dynamic_quant') + def test_unified_apply_mlp_without_quantization(self, + mock_npu_dynamic_quant, + mock_npu_swiglu, + mock_npu_grouped_matmul, + mock_is_310p): + mock_is_310p.return_value = False + + mock_npu_grouped_matmul.side_effect = [[ + torch.randn(10, 40, dtype=torch.float16) + ], [torch.randn(10, 20, dtype=torch.float16)]] + mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.float16) + mock_npu_dynamic_quant.return_value = (MagicMock(), MagicMock()) + + hidden_states = torch.randn(10, 20, dtype=torch.float16) + w1 = torch.randn(5, 20, 40, dtype=torch.float16) + w2 = torch.randn(5, 40, 20, dtype=torch.float16) + group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64) + topk_scales = torch.randn(10, 1, dtype=torch.float16) + + result = unified_apply_mlp(hidden_states=hidden_states, + w1=w1, + w1_scale=None, + w2=w2, + w2_scale=None, + group_list=group_list, + dynamic_scale=None, + group_list_type=1, 
+ w1_scale_bias=None, + w2_scale_bias=None, + topk_scales=topk_scales, + with_quant=False) + + self.assertEqual(mock_npu_grouped_matmul.call_count, 2) + mock_npu_swiglu.assert_called_once() + + self.assertEqual(result.shape, hidden_states.shape) + self.assertEqual(result.dtype, torch.float16) + + @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_swiglu') + @patch('torch_npu.npu_dynamic_quant') + def test_unified_apply_mlp_with_quantization_and_dynamic_scale( + self, mock_npu_dynamic_quant, mock_npu_swiglu, + mock_npu_grouped_matmul, mock_get_forward_context): + + mock_forward_context = MagicMock() + mock_forward_context.with_quant = True + mock_forward_context.fused_moe_state = "NOT_MC2" + mock_get_forward_context.return_value = mock_forward_context + + mock_npu_grouped_matmul.side_effect = [[ + torch.randn(10, 40, dtype=torch.bfloat16) + ], [torch.randn(10, 20, dtype=torch.bfloat16)]] + + mock_npu_swiglu.return_value = torch.randn(10, + 40, + dtype=torch.bfloat16) + + mock_npu_dynamic_quant.return_value = (torch.randint(-128, + 127, (10, 40), + dtype=torch.int8), + torch.rand(10, + 1, + dtype=torch.float32)) + + hidden_states = torch.randn(10, 20, dtype=torch.bfloat16) + w1 = torch.randn(5, 20, 40, dtype=torch.bfloat16) + w1_scale = torch.randn(5, 40, dtype=torch.bfloat16) + w2 = torch.randn(5, 40, 20, dtype=torch.bfloat16) + w2_scale = torch.randn(5, 20, dtype=torch.bfloat16) + w1_scale_bias = torch.randn(5, 40, dtype=torch.bfloat16) + w2_scale_bias = torch.randn(5, 20, dtype=torch.bfloat16) + group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64) + provided_dynamic_scale = torch.rand(10, 1, dtype=torch.float32) + + result = unified_apply_mlp(hidden_states=hidden_states, + w1=w1, + w1_scale=w1_scale, + w2=w2, + w2_scale=w2_scale, + group_list=group_list, + dynamic_scale=provided_dynamic_scale, + group_list_type=1, + w1_scale_bias=w1_scale_bias, + w2_scale_bias=w2_scale_bias, + 
topk_scales=None, + with_quant=True) + + mock_get_forward_context.assert_called() + + self.assertEqual(mock_npu_grouped_matmul.call_count, 2) + mock_npu_swiglu.assert_called_once() + mock_npu_dynamic_quant.assert_called_once() + + self.assertEqual(result.shape, hidden_states.shape) + self.assertEqual(result.dtype, torch.bfloat16) + + @patch('vllm_ascend.ops.layers.moe_mlp.is_310p') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_swiglu') + @patch('torch_npu.npu_dynamic_quant') + def test_unified_apply_mlp_without_quantization_310p( + self, mock_npu_dynamic_quant, mock_npu_swiglu, + mock_npu_grouped_matmul, mock_is_310p): + mock_is_310p.return_value = True + + mock_gmm1_out = torch.randn(10, 40, dtype=torch.float16) + mock_gmm2_out = torch.randn(10, 20, dtype=torch.float16) + mock_npu_grouped_matmul.side_effect = [[mock_gmm1_out], + [mock_gmm2_out]] + + mock_npu_swiglu.return_value = torch.randn(10, 40, dtype=torch.float16) + + mock_npu_dynamic_quant.return_value = (MagicMock(), MagicMock()) + + hidden_states = torch.randn(10, 20, dtype=torch.float16) + w1 = torch.randn(5, 20, 40, dtype=torch.float16) + w2 = torch.randn(5, 40, 20, dtype=torch.float16) + group_list = torch.tensor([2, 4, 6, 8, 10], dtype=torch.int64) + topk_scales = torch.randn(10, 1, dtype=torch.float16) + + result = unified_apply_mlp(hidden_states=hidden_states, + w1=w1, + w1_scale=None, + w2=w2, + w2_scale=None, + group_list=group_list, + dynamic_scale=None, + group_list_type=1, + w1_scale_bias=None, + w2_scale_bias=None, + topk_scales=topk_scales, + with_quant=False) + + mock_is_310p.assert_called_once() + + self.assertEqual(mock_npu_grouped_matmul.call_count, 2) + mock_npu_swiglu.assert_called_once() + + self.assertEqual(result.shape, hidden_states.shape) + self.assertEqual(result.dtype, torch.float16) diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py new file mode 100644 index 0000000..c7bc657 --- /dev/null +++ b/tests/ut/ops/test_layernorm.py @@ -0,0 
+1,53 @@ +from unittest.mock import patch + +import pytest +import torch +from vllm.model_executor.layers.layernorm import RMSNorm + + +@pytest.fixture +def dummy_tensor(): + return torch.randn(4, 8, dtype=torch.float16) + + +def mock_rms_norm(x, weight, eps): + return x + 1, None + + +def mock_add_rms_norm(x, residual, weight, eps): + return 2 * x, None, 2 * residual + + +@pytest.mark.parametrize("is_310p_return", [True, False]) +@pytest.mark.parametrize("residual", + [None, torch.randn(4, 8, dtype=torch.float32)]) +@patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) +@patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm) +def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p_return, + residual, dummy_tensor): + + with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return): + layer = RMSNorm(hidden_size=32, eps=1e-05) + if residual is not None: + out_x, out_residual = layer.forward_oot(dummy_tensor, residual) + + if is_310p_return: + expected_arg_x = dummy_tensor + residual.to(dummy_tensor.dtype) + expected_out_x = expected_arg_x + 1 + expected_out_residual = expected_arg_x.to(residual.dtype) + + mock_rmsnorm.assert_called_once() + assert torch.allclose(out_x, expected_out_x) + assert torch.allclose(out_residual, expected_out_residual) + else: + expected_out_x = 2 * dummy_tensor + expected_out_residual = 2 * residual + mock_add_rmsnorm.assert_called_once() + assert torch.allclose(out_x, expected_out_x) + assert torch.allclose(out_residual, expected_out_residual) + else: + out_x = layer.forward(dummy_tensor, residual) + expected_out_x = dummy_tensor + 1 + + mock_rmsnorm.assert_called_once() + assert torch.allclose(out_x, expected_out_x) diff --git a/tests/ut/ops/test_linear.py b/tests/ut/ops/test_linear.py new file mode 100644 index 0000000..28b26b7 --- /dev/null +++ b/tests/ut/ops/test_linear.py @@ -0,0 +1,363 @@ +import os +import unittest +from unittest import mock + +import torch + +from vllm_ascend.ops.linear import 
(AscendMlpColumnParallelLinear, + AscendMlpMergedColumnParallelLinear, + AscendMlpRowParallelLinear, LinearBase, + QuantizationConfig) + + +class TestAscendMlpRowParallelLinear(unittest.TestCase): + + def setUp(self): + os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1" + self.tensor_parallel_world_size = 2 + self.tensor_parallel_rank = 0 + self.mlp_tensor_parallel_world_size = 2 + self.mlp_tensor_parallel_rank = 1 + + self.get_tensor_model_parallel_world_size_patch = mock.patch( + 'vllm_ascend.ops.linear.get_tensor_model_parallel_world_size', + return_value=self.tensor_parallel_world_size) + self.get_tensor_model_parallel_rank_patch = mock.patch( + 'vllm_ascend.ops.linear.get_tensor_model_parallel_rank', + return_value=self.tensor_parallel_rank) + self.get_mlp_tensor_model_parallel_world_size_patch = mock.patch( + 'vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_world_size', + return_value=self.mlp_tensor_parallel_world_size) + self.get_mlp_tensor_model_parallel_rank_patch = mock.patch( + 'vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_rank', + return_value=self.mlp_tensor_parallel_rank) + + self.get_tensor_model_parallel_world_size_mock = \ + self.get_tensor_model_parallel_world_size_patch.start() + self.get_tensor_model_parallel_rank_mock = \ + self.get_tensor_model_parallel_rank_patch.start() + self.get_mlp_tensor_model_parallel_world_size_mock = \ + self.get_mlp_tensor_model_parallel_world_size_patch.start() + self.get_mlp_tensor_model_parallel_rank_mock = \ + self.get_mlp_tensor_model_parallel_rank_patch.start() + + self.split_tensor_along_last_dim_patch = mock.patch( + 'vllm_ascend.ops.linear.split_tensor_along_last_dim', + return_value=(torch.randn(10, 8), torch.randn(10, 8))) + self.tensor_model_parallel_all_reduce_patch = mock.patch( + 'vllm_ascend.ops.linear.tensor_model_parallel_all_reduce', + return_value=torch.randn(10, 8)) + self.tensor_model_parallel_all_reduce_mock = \ + self.tensor_model_parallel_all_reduce_patch.start() + 
self.split_tensor_along_last_dim_mock = \ + self.split_tensor_along_last_dim_patch.start() + self.get_mlp_tp_group_patch = \ + mock.patch('vllm_ascend.ops.linear.get_mlp_tp_group') + self.get_mlp_tp_group_mock = self.get_mlp_tp_group_patch.start() + self.get_mlp_tp_group_mock.return_value = mock.MagicMock() + self.get_mlp_tp_group_mock.return_value.reduce_scatter = \ + mock.MagicMock() + + def tearDown(self): + self.get_tensor_model_parallel_world_size_patch.stop() + self.get_tensor_model_parallel_rank_patch.stop() + self.get_mlp_tensor_model_parallel_world_size_patch.stop() + self.get_mlp_tensor_model_parallel_rank_patch.stop() + self.split_tensor_along_last_dim_patch.stop() + self.tensor_model_parallel_all_reduce_patch.stop() + self.get_mlp_tp_group_patch.stop() + + def test_init_with_down_proj_prefix(self): + layer = AscendMlpRowParallelLinear(input_size=16, + output_size=8, + prefix="down_proj") + self.assertEqual(layer.tp_size, self.mlp_tensor_parallel_world_size) + self.assertEqual(layer.tp_rank, self.mlp_tensor_parallel_rank) + self.assertTrue(layer.enable_mlp_optimze) + + def test_forward_with_mlp_optimize(self): + layer = AscendMlpRowParallelLinear( + input_size=16, + output_size=8, + prefix="down_proj", + input_is_parallel=False, + ) + input_tensor = torch.randn(16, 8) # (batch_size, input_size) + layer(input_tensor) + + self.split_tensor_along_last_dim_mock.assert_called_once_with( + input_tensor, num_partitions=layer.tp_size) + + def test_forward_without_mlp_optimize(self): + layer = AscendMlpRowParallelLinear( + input_size=16, + output_size=8, + prefix="other", + input_is_parallel=False, + ) + input_tensor = torch.randn(16, 8) + layer(input_tensor) + + self.split_tensor_along_last_dim_mock.assert_called_once_with( + input_tensor, num_partitions=layer.tp_size) + self.tensor_model_parallel_all_reduce_mock.assert_called_once() + + def test_skip_bias_add(self): + layer = AscendMlpRowParallelLinear( + input_size=16, + output_size=8, + skip_bias_add=True, + 
) + input_tensor = torch.randn(16, 8) + output, bias = layer(input_tensor) + + self.assertIsNotNone(bias) + + def test_no_reduce_results(self): + layer = AscendMlpRowParallelLinear(input_size=16, + output_size=8, + reduce_results=False, + bias=False) + input_tensor = torch.randn(16, 8) + layer(input_tensor) + + self.tensor_model_parallel_all_reduce_mock.assert_not_called() + + def test_input_not_parallel(self): + layer = AscendMlpRowParallelLinear(input_size=16, + output_size=8, + input_is_parallel=False) + input_tensor = torch.randn(16, 8) + layer(input_tensor) + + self.split_tensor_along_last_dim_mock.assert_called_once() + + def test_exception_when_reduce_false_and_bias(self): + with self.assertRaises(ValueError): + AscendMlpRowParallelLinear(input_size=16, + output_size=8, + reduce_results=False, + bias=True, + skip_bias_add=False) + + +class TestAscendMlpColumnParallelLinear(unittest.TestCase): + + def setUp(self): + os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1" + # Mock distributed functions + self.mlp_tp_size_patch = \ + mock.patch('vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_world_size') + self.mlp_tp_size_mock = self.mlp_tp_size_patch.start() + self.mlp_tp_size_mock.return_value = 2 # Simulate 2 GPUs in MLP TP group + + self.mlp_tp_rank_patch = \ + mock.patch('vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_rank') + self.mlp_tp_rank_mock = self.mlp_tp_rank_patch.start() + self.mlp_tp_rank_mock.return_value = 0 # Current GPU rank + + self.tp_size_patch = \ + mock.patch('vllm_ascend.ops.linear.get_tensor_model_parallel_world_size') + self.tp_size_mock = self.tp_size_patch.start() + self.tp_size_mock.return_value = 4 # Simulate 4 GPUs in regular TP group + + self.tp_rank_patch = \ + mock.patch('vllm_ascend.ops.linear.get_tensor_model_parallel_rank') + self.tp_rank_mock = self.tp_rank_patch.start() + self.tp_rank_mock.return_value = 1 # Current GPU rank + + # Mock divide function (assumed to be in your module) + self.divide_patch = 
mock.patch('vllm_ascend.ops.linear.divide') + self.divide_mock = self.divide_patch.start() + self.divide_mock.side_effect = lambda x, y: x // y # Simulate division + + # Mock QuantizationConfig and QuantMethod + self.quant_config_mock = mock.MagicMock(spec=QuantizationConfig) + + # Mock LinearBase initialization + self.linear_base_init_patch = mock.patch.object( + LinearBase, "__init__", side_effect=self.mock_linear_base_init) + self.linear_base_init_patch.start() + + self.quant_method_mock = mock.MagicMock() + + def mock_linear_base_init(self, instance, *args, **kwargs): + instance.quant_method = self.quant_method_mock + instance.params_dtype = mock.MagicMock() + + instance.input_size = 16 + instance.output_size = 8 + instance.output_size_per_partition = 4 + instance.params_dtype = torch.float32 + + def tearDown(self): + self.mlp_tp_size_patch.stop() + self.mlp_tp_rank_patch.stop() + self.tp_size_patch.stop() + self.tp_rank_patch.stop() + self.divide_patch.stop() + self.linear_base_init_patch.stop() + + def test_mlp_optimize_initialization(self): + # Test when prefix contains "gate_up_proj" + with mock.patch.object(torch.nn.Module, 'register_parameter'): + layer = AscendMlpColumnParallelLinear( + input_size=16, + output_size=8, + prefix="model.layers.0.gate_up_proj", + bias=False, + ) + + # Verify MLP optimization flags + self.assertTrue(layer.enable_mlp_optimze) + self.assertEqual(layer.tp_size, 2) + self.assertEqual(layer.tp_rank, 0) + self.assertEqual(layer.input_size_per_partition, 16) + self.assertEqual(layer.output_size_per_partition, 4) + + # Check quant_method.create_weights was called + self.quant_method_mock.create_weights.assert_called_once() + + def test_regular_parallel_initialization(self): + # Test when prefix does NOT contain "gate_up_proj" + with mock.patch.object(torch.nn.Module, 'register_parameter'): + layer = AscendMlpColumnParallelLinear( + input_size=16, + output_size=8, + prefix="model.layers.0.q_proj", + 
quant_config=self.quant_config_mock, + bias=False, + ) + + # Verify regular TP flags + self.assertFalse(layer.enable_mlp_optimze) + self.assertEqual(layer.tp_size, 4) + self.assertEqual(layer.tp_rank, 1) + self.assertEqual(layer.input_size_per_partition, 16) + self.assertEqual(layer.output_size_per_partition, 4) + # Check quant_method.create_weights was called + self.quant_method_mock.create_weights.assert_called_once() + + def test_output_sizes_handling(self): + # Test when output_sizes is provided + with mock.patch.object(torch.nn.Module, 'register_parameter'): + layer = AscendMlpColumnParallelLinear( + input_size=16, + output_size=8, + output_sizes=[4, 4], + prefix="model.layers.0.qkv_proj", + quant_config=self.quant_config_mock, + bias=False, + ) + + # Verify output_partition_sizes + self.assertEqual(layer.output_partition_sizes, [2]) + + +class TestAscendMlpMergedColumnParallelLinear(unittest.TestCase): + + def setUp(self): + os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMIZE"] = "1" + # Mock get_mlp_tensor_model_parallel_world_size and get_tensor_model_parallel_world_size + self.mlp_world_size_patch = \ + mock.patch("vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_world_size", return_value=2) + self.tensor_world_size_patch = \ + mock.patch("vllm_ascend.ops.linear.get_tensor_model_parallel_world_size", return_value=2) + self.mlp_world_size_patch.start() + self.tensor_world_size_patch.start() + + # Mock get_mlp_tensor_model_parallel_rank and get_tensor_model_parallel_rank + self.mlp_rank_patch = \ + mock.patch("vllm_ascend.ops.linear.get_mlp_tensor_model_parallel_rank", return_value=0) + self.tensor_rank_patch = \ + mock.patch("vllm_ascend.ops.linear.get_tensor_model_parallel_rank", return_value=0) + self.mlp_rank_patch.start() + self.tensor_rank_patch.start() + + # Mock all_gather methods + self.get_mlp_tp_group_patch = \ + mock.patch('vllm_ascend.ops.linear.get_mlp_tp_group') + self.get_mlp_tp_group_mock = self.get_mlp_tp_group_patch.start() + 
self.get_mlp_tp_group_mock.return_value = mock.MagicMock() + self.get_mlp_tp_group_mock.return_value.all_gather = mock.MagicMock() + self.tensor_model_parallel_all_gather_patch = mock.patch( + 'vllm_ascend.ops.linear.tensor_model_parallel_all_gather', + return_value=torch.randn(10, 8)) + self.tensor_model_parallel_all_gather_mock = \ + self.tensor_model_parallel_all_gather_patch.start() + + # Mock AscendMlpColumnParallelLinear's __init__ + self.linear_init_patch = mock.patch.object( + AscendMlpColumnParallelLinear, + "__init__", + side_effect=self.mock_linear_init) + self.linear_init_patch.start() + + # Create mock objects + self.quant_method_mock = mock.MagicMock() + self.apply_output = torch.randn(2, 8) + + self.quant_method_mock.apply.return_value = self.apply_output + + def mock_linear_init(self, instance, *args, **kwargs): + torch.nn.Module.__init__(instance) + # Set quant_method and other attributes + instance.quant_method = self.quant_method_mock + instance.bias = torch.nn.Parameter(torch.randn(8)) # Example bias + instance.input_size = 16 + instance.output_size = 8 + instance.gather_output = False + instance.skip_bias_add = False + instance.return_bias = True + + def test_forward_with_enable_mlp_optimze(self): + # Setup input + input_tensor = torch.randn(1, 16) + + # Create instance with prefix "gate_up_proj" to trigger enable_mlp_optimze = True + layer = AscendMlpMergedColumnParallelLinear(input_size=16, + output_sizes=[8], + bias=True, + gather_output=False, + skip_bias_add=False, + params_dtype=torch.float32, + quant_config=None, + prefix="other_proj") + + # Call forward + output, bias = layer(input_tensor) + + # Validate calls + self.assertEqual(output.shape, self.apply_output.shape) + + def test_forward_without_enable_mlp_optimze(self): + # Setup input + input_tensor = torch.randn(1, 16) + + # Create instance with prefix not containing "gate_up_proj" + layer = AscendMlpMergedColumnParallelLinear(input_size=16, + output_sizes=[8], + bias=True, + 
gather_output=False, + skip_bias_add=False, + params_dtype=torch.float32, + quant_config=None, + prefix="other_proj") + + # Call forward + output, bias = layer(input_tensor) + + # Validate calls + self.quant_method_mock.apply.assert_called_once_with( + layer, input_tensor, layer.bias) + self.tensor_model_parallel_all_gather_mock.assert_not_called() + self.assertEqual(output.shape, self.apply_output.shape) + + def tearDown(self): + self.linear_init_patch.stop() + self.mlp_world_size_patch.stop() + self.tensor_world_size_patch.stop() + self.mlp_rank_patch.stop() + self.tensor_rank_patch.stop() + self.get_mlp_tp_group_mock.stop() + self.tensor_model_parallel_all_gather_mock.stop() diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py new file mode 100644 index 0000000..eb48c81 --- /dev/null +++ b/tests/ut/ops/test_rotary_embedding.py @@ -0,0 +1,318 @@ +import math +import unittest +from unittest.mock import MagicMock, PropertyMock, patch + +import torch +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) + +from tests.ut.base import TestBase +from vllm_ascend.ops.rotary_embedding import _custom_rotary_embedding_enabled + + +class TestCustomRotaryEmbeddingEnabled(unittest.TestCase): + + def setUp(self): + # Common setup for tests + self.positions = torch.tensor([1, 2, 3]) + self.query = torch.randn(3, 4, dtype=torch.float16) + self.key = torch.randn(3, 4, dtype=torch.float16) + self.head_size = 32 + self.cos_sin_cache = torch.randn(3, 4) + + # Mock self object for rope_forward_oot + self.mock_self = MagicMock() + self.mock_self.head_size = self.head_size + self.mock_self.cos_sin_cache = self.cos_sin_cache + self.mock_self.is_neox_style = True + self.mock_self.forward_native.return_value = (self.query, self.key) + + def test_custom_rotary_embedding_enabled(self): + # Test when all conditions are True + with patch('vllm_ascend.ops.rotary_embedding.enable_custom_op', + 
return_value=True): + result = _custom_rotary_embedding_enabled(self.query, True, + self.head_size) + self.assertTrue(result) + + # Test when dtype is not float16 + with patch('vllm_ascend.ops.rotary_embedding.enable_custom_op', + return_value=True): + query = self.query.to(torch.float32) + result = _custom_rotary_embedding_enabled(query, True, + self.head_size) + self.assertFalse(result) + + # Test when neox_style is False + with patch('vllm_ascend.ops.rotary_embedding.enable_custom_op', + return_value=True): + result = _custom_rotary_embedding_enabled(self.query, False, + self.head_size) + self.assertFalse(result) + + # Test when head_size is not divisible by 32 + with patch('vllm_ascend.ops.rotary_embedding.enable_custom_op', + return_value=True): + result = _custom_rotary_embedding_enabled(self.query, True, + self.head_size + 1) + self.assertFalse(result) + + # Test when custom op is disabled + with patch('vllm_ascend.ops.rotary_embedding.enable_custom_op', + return_value=False): + result = _custom_rotary_embedding_enabled(self.query, True, + self.head_size) + self.assertFalse(result) + + +class TestAscendRotaryEmbedding(unittest.TestCase): + + def setUp(self): + # Common setup for tests + self.positions = torch.tensor([1, 2, 3]) + self.query = torch.randn(3, 1, 32, dtype=torch.float16) + self.key = torch.randn(3, 1, 32, dtype=torch.float16) + self.head_size = 32 + self.rotary_dim = self.head_size + self.max_position = 16 + self.rope_theta = 10000 + self.is_neox_style = True + self.cos_sin_cache = torch.randn(3, 1, 32) + self.layer = RotaryEmbedding(self.head_size, self.rotary_dim, + self.max_position, self.rope_theta, + self.is_neox_style, torch.float16) + + # Mock self object for rope_forward_oot + self.mock_self = MagicMock() + self.mock_self.head_size = self.head_size + self.mock_self.cos_sin_cache = self.cos_sin_cache + self.mock_self.is_neox_style = self.is_neox_style + + @patch('torch.ops._C') + @patch('vllm_ascend.ops.rotary_embedding.is_310p', 
return_value=False) + @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled', + return_value=True) + @patch('torch.ops._npu_rotary_embedding') + def test_rope_forward_oot_custom_kernel(self, mock_rotary_embedding, + mock_custom_enabled, mock_is_310p, + mock__c): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + + # Setup mock for custom kernel path + + mock__c.rotary_embedding.return_value = self.query, self.key + + result_q, result_k = self.layer.forward(self.positions, self.query, + self.key) + + mock__c.rotary_embedding.assert_called_once() + self.assertEqual(result_q.shape, self.query.shape) + self.assertEqual(result_k.shape, self.key.shape) + + @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled', + return_value=False) + @patch('torch_npu._npu_rotary_embedding') + def test_rope_forward_oot_contiguous(self, mock_npu_rotary, + mock_custom_enabled): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + + # Test contiguous path when custom is disabled + non_contig_query = self.query.transpose(0, 1) + non_contig_key = self.key.transpose(0, 1) + + result_q, result_k = self.layer.forward(self.positions, + non_contig_query, + non_contig_key) + + mock_npu_rotary.assert_called_once() + self.assertEqual(result_q.shape, non_contig_query.shape) + self.assertEqual(result_k.shape, non_contig_key.shape) + + def test_rope_forward_oot_with_offsets(self): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + + # Test that NotImplementedError is raised when offsets is provided + offsets = torch.tensor([1, 2, 3]) + with self.assertRaises(NotImplementedError): + self.layer.forward(self.positions, self.query, self.key, offsets) + + @patch('vllm_ascend.ops.rotary_embedding._custom_rotary_embedding_enabled', + return_value=False) + @patch('torch_npu._npu_rotary_embedding') + def test_rope_forward_oot_neox_style_override(self, mock_npu_rotary, + 
mock_custom_enabled): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + + # Test neox_style override + result_q, result_k = self.layer.forward(self.positions, + self.query, + self.key, + is_neox_style_override=False) + + # Check that neox_style=False was passed to the NPU function + args, kwargs = mock_npu_rotary.call_args + self.assertFalse(args[-1]) + + +class MockRopeModule: + + def __init__(self, max_seq_len=2048, is_neox_style=True): + self.max_seq_len = max_seq_len + self.is_neox_style = is_neox_style + self.cos_cached = None + self.sin_cached = None + self.rotary_dim = 1 + self.base = 1 + + +class TestAscendDeepseekScalingRotaryEmbedding(TestBase): + + def setUp(self): + # Common setup for tests + self.positions = torch.tensor([1, 2, 3]) + self.query = torch.randn(3, 1, 32, dtype=torch.float16) + self.key = torch.randn(3, 1, 32, dtype=torch.float16) + self.head_size = 32 + self.rotary_dim = self.head_size + self.max_position = 16 + self.rope_theta = 10000 + self.is_neox_style = True + self.scaling_factor = 1 + self.layer = None + + def _create_layer(self): + self.layer = DeepseekScalingRotaryEmbedding( + self.head_size, self.rotary_dim, self.max_position, + self.rope_theta, self.is_neox_style, self.scaling_factor, + torch.float16) + return self.layer + + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_native_rope_deepseek_forward_base(self, mock_npuplatform): + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + with patch("vllm_ascend.ops.rotary_embedding._rope_forward_oot", + return_value=(self.query, + self.key)) as mock_rope_forward_oot: + q_pe, k_pe = self.layer.forward(self.positions, self.query, + self.key) + mock_rope_forward_oot.assert_called_once() + assert q_pe.shape == self.query.shape + assert k_pe.shape == self.key.shape + + 
@patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot') + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_native_rope_deepseek_forward_cache_handling( + self, mock_npuplatform, mock_rope_forward_oot): + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + self.layer.max_seq_len = 1024 + # Test cache situation is true + with patch.object(self.layer, "_set_cos_sin_cache") as mock_set_cache: + mock_rope_forward_oot.return_value = (self.query, self.key) + + q_pe, k_pe = self.layer.forward(self.positions, + self.query, + self.key, + max_seq_len=2048) + mock_set_cache.assert_called_once() + assert q_pe.shape == self.query.shape + assert k_pe.shape == self.key.shape + + @patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot') + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_native_rope_deepseek_forward_key_reshaping( + self, mock_npuplatform, mock_rope_forward_oot): + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + + key = torch.randn(1, 32) + + mock_rope_forward_oot.return_value = (self.query, key) + + q_pe, k_pe = self.layer.forward(self.positions, self.query, key) + mock_rope_forward_oot.assert_called_once() + assert q_pe.shape == self.query.shape + assert k_pe.shape == key.shape + + @patch('vllm_ascend.ops.rotary_embedding._rope_forward_oot') + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_native_rope_deepseek_forward_non_neox_style( + self, mock_npuplatform, mock_rope_forward_oot): + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + + 
mock_rope_forward_oot.return_value = (self.query, self.key) + + q_pe, k_pe = self.layer.forward(self.positions, self.query, self.key) + + mock_rope_forward_oot.assert_called_once() + assert q_pe.shape == self.query.shape + assert k_pe.shape == self.key.shape + + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_basic_case(self, mock_npuplatform): + # Test with standard values + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + num_rotations = 100 + dim = 512 + base = 10000 + max_position_embeddings = 2048 + + result = self.layer._yarn_find_correction_dim(num_rotations, dim, base, + max_position_embeddings) + + # Calculate expected value manually + expected = (dim * torch.log( + torch.tensor(max_position_embeddings) / + (num_rotations * 2 * torch.pi))) / (2 * + torch.log(torch.tensor(base))) + + self.assertTrue(torch.allclose(result, expected)) + + @patch("vllm.platforms.current_platform.device_type", + new=torch.device("cpu")) + @patch("vllm_ascend.ops.rotary_embedding.NPUPlatform", + new_callable=PropertyMock) + def test_yarn_get_mscale(self, mock_npuplatform): + mock_npuplatform.device_type = torch.device("cpu") + self.layer = self._create_layer() + + # test_scale_less_than_or_equal_1 + self.assertEqual(self.layer._yarn_get_mscale(scale=0.5), 1.0) + self.assertEqual(self.layer._yarn_get_mscale(scale=1.0), 1.0) + self.assertEqual(self.layer._yarn_get_mscale(scale=0.999), 1.0) + + # test_scale_greater_than_1: + test_cases = [(2.0, 1.0, 1.0 + 0.1 * math.log(2.0)), + (10.0, 1.0, 1.0 + 0.1 * math.log(10.0)), + (5.0, 2.0, 1.0 + 0.2 * math.log(5.0)), + (math.e, 1.0, 1.0 + 0.1)] + + for scale, mscale, expected in test_cases: + result = self.layer._yarn_get_mscale(scale, mscale) + self.assertAlmostEqual( + result, + expected, + places=6, + msg=f"Failed for scale={scale}, mscale={mscale}") diff --git 
a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py new file mode 100644 index 0000000..9de8a13 --- /dev/null +++ b/tests/ut/ops/test_token_dispatcher.py @@ -0,0 +1,606 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +from unittest.mock import MagicMock, PropertyMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.ops.moe_dispatcher.token_dispatcher import ( + AscendSocVersion, TokenDispatcherWithAll2AllV, + TokenDispatcherWithAllGather, TokenDispatcherWithMC2, _Dispatchers, + _register_token_dispatcher, get_token_dispatcher, setup_token_dispatchers) + + +class TestTokenDispatcherWithMC2(TestBase): + + def setUp(self): + self.mc2_group = MagicMock() + self.mc2_group.device_group.return_value._get_backend.return_value.get_hccl_comm_name.return_value = "hccl_123" + self.mc2_group.rank_in_group = 0 + self.mc2_group.world_size = 8 + self.mc2_group_patch = patch( + "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_mc2_group", + return_value=self.mc2_group) + self.mc2_group_patch.start() + + self.rank_group_patch = patch("torch.distributed.get_rank", + return_value=0) + self.rank_group_patch.start() + + # Mock get_forward_context().mc2_mask + self.forward_context = MagicMock() + self.forward_context.mc2_mask = torch.tensor([1, 0, 1]) + 
self.forward_context_patch = patch( + "vllm.forward_context.get_forward_context", + return_value=self.forward_context) + self.forward_context_patch.start() + + # Mock get_ascend_soc_version() + self.ascend_soc_version_patch = patch( + "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ascend_soc_version", + return_value=AscendSocVersion.A3) + self.ascend_soc_version_patch.start() + + kwargs = {"with_quant": False, "top_k": 8, "num_experts": 128} + self.dispatcher = TokenDispatcherWithMC2(**kwargs) + self.row_idx = torch.arange(10, dtype=torch.int32) + + def tearDown(self): + self.mc2_group_patch.stop() + self.forward_context_patch.stop() + self.ascend_soc_version_patch.stop() + + def test_init(self): + self.assertEqual(self.dispatcher.ep_rank_id, 0) + self.assertEqual(self.dispatcher.ep_world_size, 8) + self.assertFalse(self.dispatcher.with_quant) + self.assertTrue(self.dispatcher.enable_dispatch_v2) + self.assertTrue(self.dispatcher.need_extra_args) + self.assertTrue(self.dispatcher.a3_need_extra_args) + + def test_get_dispatch_mc2_kwargs_without_quant(self): + hidden_states = torch.randn(10, 128) + topk_ids = torch.randint(0, 8, (10, 1)) + topk_weights = torch.randn(10, 1) + expert_map = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + + kwargs = self.dispatcher.get_dispatch_mc2_kwargs( + hidden_states, topk_weights, topk_ids, expert_map) + self.assertIn("x", kwargs) + self.assertIn("expert_ids", kwargs) + self.assertEqual(kwargs["moe_expert_num"], 8) + + def test_token_permutation_dispatch(self): + hidden_states = torch.randn(10, 128) + topk_weights = torch.randn(10, 1) + topk_ids = torch.randint(0, 8, (10, 1)) + expert_map = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + + with patch("torch_npu.npu_moe_distribute_dispatch_v2", + return_value=(torch.randn(10, 128), ) * 5) as mock_dispatch: + output = self.dispatcher.token_dispatch(hidden_states, + topk_weights, topk_ids, + self.row_idx, expert_map) + mock_dispatch.assert_called_once() + 
self.assertEqual(output["group_list_type"], + 1) # group_list_type == 1 + + def test_token_dispatch_with_shared_experts_and_quant(self): + self.shared_experts = MagicMock() + self.shared_experts.gate_up_proj.return_value = (torch.randn(10, 128), + torch.tensor(1.0)) + self.shared_experts.act_fn.return_value = torch.randn(10, 128) + self.dispatcher.with_quant = False + self.dispatcher.shared_act = torch.randn(10, 128) + self.dispatcher.swiglu_out_scale = torch.tensor(1.0) + self.hidden_states = torch.randn(10, 128) + self.topk_weights = torch.randn(10, 1) + + with patch("torch_npu.npu_moe_distribute_dispatch_v2", + return_value=(torch.randn(10, 128), ) * 5): + self.dispatcher.token_dispatch(self.hidden_states, + self.topk_weights, + torch.randint(0, 8, (10, 1)), + self.row_idx, + torch.tensor( + [0, 1, 2, 3, 4, 5, 6, 7]), + shared_experts=self.shared_experts) + + def test_get_combine_mc_kwargs_with_quant(self): + self.dispatcher.with_quant = True + hidden_states = torch.randn(10, 128) + self.dispatcher.topk_ids = torch.randint(0, 8, (10, 1)) + self.dispatcher.topk_weights = torch.randint(0, 8, (10, 1)) + self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + self.dispatcher.ep_recv_counts = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + self.dispatcher.need_extra_args = True + self.dispatcher.enable_dispatch_v2 = True + self.dispatcher.output = torch.randint(0, 8, (10, 1)) + + kwargs = self.dispatcher.get_combine_mc_kwargs(hidden_states) + self.assertIn("tp_send_counts", kwargs) + + def test_token_combine_with_shared_experts(self): + self.dispatcher.shared_experts = MagicMock() + self.dispatcher.shared_experts.down_proj.return_value = (torch.randn( + 10, 128), torch.tensor(1.0)) + self.dispatcher.shared_act = torch.randn(10, 128) + self.dispatcher.with_quant = True + self.dispatcher.topk_ids = torch.randint(0, 8, (10, 1)) + self.dispatcher.topk_weights = torch.randint(0, 8, (10, 1)) + self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + 
self.dispatcher.ep_recv_counts = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7]) + self.dispatcher.need_extra_args = True + self.dispatcher.enable_dispatch_v2 = True + self.dispatcher.swiglu_out_scale = torch.randint(0, 8, (10, 1)) + self.dispatcher.output = torch.randint(0, 8, (10, 1)) + self.hidden_states = torch.randn(10, 128) + + with patch("torch_npu.npu_moe_distribute_combine_v2", + return_value=torch.randn(10, 128)): + self.dispatcher.token_combine(self.hidden_states) + + +class TestTokenDispatcherWithAllGather(TestBase): + + def setUp(self): + # Mock dependencies + kwargs = { + "apply_router_weight_on_input": False, + "top_k": 2, + "max_num_tokens": 100, + "ep_size": 2, + "num_experts": 128, + "with_quant": False, + } + self.dispatcher = TokenDispatcherWithAllGather(**kwargs) + + # Mock NPU functions + self.patcher_moe_init_routing = patch('torch_npu.npu_moe_init_routing') + self.mock_moe_init_routing = self.patcher_moe_init_routing.start() + self.mock_moe_init_routing.return_value = ( + torch.randn(6, 128), # sorted_hidden_states + torch.tensor([0, 1, 2, 3, 4, 5]), # expanded_row_idx + torch.tensor([0, 1, 0, 1, 0, 1]) # expanded_expert_idx + ) + + self.patcher_moe_compute_expert_tokens = patch( + 'torch_npu.npu_moe_compute_expert_tokens') + self.mock_moe_compute_expert_tokens = self.patcher_moe_compute_expert_tokens.start( + ) + self.mock_moe_compute_expert_tokens.return_value = torch.tensor( + [3, 3]) # expert_tokens + + self.patcher_moe_finalize_routing = patch( + 'torch_npu.npu_moe_finalize_routing') + self.mock_moe_finalize_routing = self.patcher_moe_finalize_routing.start( + ) + self.mock_moe_finalize_routing.return_value = torch.randn(3, 128) + self.row_idx = torch.arange(10, dtype=torch.int32) + + def tearDown(self): + self.patcher_moe_init_routing.stop() + self.patcher_moe_compute_expert_tokens.stop() + self.patcher_moe_finalize_routing.stop() + + def test_token_dispatch_without_expert_map(self): + hidden_states = torch.randn(3, 128) + topk_weights = 
torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]]) + topk_ids = torch.tensor([[0, 1], [1, 2], [2, 3]]) + + results = self.dispatcher.token_dispatch(hidden_states, topk_weights, + topk_ids, self.row_idx, None) + + # Verify npu_moe_init_routing is called + self.mock_moe_init_routing.assert_called_once() + args, kwargs = self.mock_moe_init_routing.call_args + + self.assertEqual(results["group_list_type"], 0) + + def test_token_dispatch_with_quant(self): + kwargs = { + "apply_router_weight_on_input": False, + "top_k": 2, + "max_num_tokens": 100, + "ep_size": 2, + "num_experts": 128, + } + self.dispatcher_quant = TokenDispatcherWithAllGather(**kwargs) + + hidden_states = torch.randn(3, 128) + topk_weights = torch.tensor([[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]]) + topk_ids = torch.tensor([[0, 1], [1, 2], [2, 3]]) + + results = self.dispatcher_quant.token_dispatch(hidden_states, + topk_weights, topk_ids, + self.row_idx, None) + + self.assertEqual(results["group_list_type"], 0) + + def test_token_combine_with_expert_map(self): + self.dispatcher.expert_map = torch.tensor([0, 1, 2, 3]) + self.dispatcher.sorted_token_indices = torch.tensor([0, 1, 1, 1, 1, 1]) + self.dispatcher.sorted_weights = torch.tensor( + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]) + self.dispatcher.original_shape = (3, 128) + self.dispatcher.mask = torch.tensor([0, 1, 1, 0]) + hidden_states = torch.randn(6, 128) + + final_hidden_states = self.dispatcher.token_combine(hidden_states) + + # Verify index_add_ is applied correctly + self.assertEqual(final_hidden_states.shape, (3, 128)) + + def test_token_combine_without_expert_map(self): + self.dispatcher.with_quant = False + self.dispatcher.expanded_row_idx = torch.tensor([0, 1, 1, 1, 1, 1]) + self.dispatcher.topk_ids = torch.tensor([[0, 1], [1, 2], [2, 3]]) + self.dispatcher.sorted_token_indices = torch.tensor([0, 1, 1, 1, 1, 1]) + self.dispatcher.sorted_weights = torch.tensor( + [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]) + self.dispatcher.original_shape = (3, 128) + 
self.dispatcher.mask = torch.tensor([0, 1, 1, 0])
+        hidden_states = torch.randn(6, 128)
+
+        final_hidden_states = self.dispatcher.token_combine(hidden_states)
+
+        # Verify npu_moe_finalize_routing is called
+        self.mock_moe_finalize_routing.assert_called_once()
+        args, kwargs = self.mock_moe_finalize_routing.call_args
+
+        self.assertEqual(final_hidden_states.shape, (3, 128))
+
+    def test_token_dispatch_with_router_weight(self):
+        self.dispatcher.apply_router_weight_on_input = True
+        hidden_states = torch.randn(3, 128)
+        topk_weights = torch.tensor([[0.7], [0.6], [0.5]])  # topk=1
+        topk_ids = torch.tensor([[0], [1], [2]])
+
+        results = self.dispatcher.token_dispatch(hidden_states, topk_weights,
+                                                 topk_ids, self.row_idx, None)
+        self.assertEqual(results["hidden_states"].shape, (6, 128))
+
+
+class TestTokenDispatcherWithAll2AllV(TestBase):
+
+    def setUp(self):
+        # Patch properties
+        patcher1 = patch.object(TokenDispatcherWithAll2AllV,
+                                'ep_group',
+                                new_callable=PropertyMock,
+                                return_value=MagicMock())
+        patcher2 = patch.object(TokenDispatcherWithAll2AllV,
+                                'ep_rank',
+                                new_callable=PropertyMock,
+                                return_value=0)
+        patcher3 = patch.object(TokenDispatcherWithAll2AllV,
+                                'ep_size',
+                                new_callable=PropertyMock,
+                                return_value=2)
+
+        self.addCleanup(patcher1.stop)
+        self.addCleanup(patcher2.stop)
+        self.addCleanup(patcher3.stop)
+
+        self.mock_ep_group_prop = patcher1.start()
+        self.mock_ep_rank_prop = patcher2.start()
+        self.mock_ep_size_prop = patcher3.start()
+
+        # Mock torch_npu.npu_moe_token_permute
+        patcher4 = patch('torch_npu.npu_moe_token_permute')
+        self.mock_npu_moe_token_permute = patcher4.start()
+        self.addCleanup(patcher4.stop)
+        self.mock_npu_moe_token_permute.return_value = (torch.randn(16, 16),
+                                                        torch.arange(16))
+
+        # Mock torch_npu.npu_moe_token_unpermute
+        patcher5 = patch('torch_npu.npu_moe_token_unpermute')
+        self.mock_npu_moe_token_unpermute = patcher5.start()
+        self.addCleanup(patcher5.stop)
+        self.mock_npu_moe_token_unpermute.return_value = torch.randn(8,
16) + + # Mock async_all_to_all + patcher6 = patch('vllm_ascend.ops.comm_utils.async_all_to_all') + self.mock_async_all_to_all = patcher6.start() + self.addCleanup(patcher6.stop) + self.mock_async_all_to_all.return_value = (None, torch.randn(16, 16), + MagicMock()) + + # Mock gather_from_sequence_parallel_region + patcher7 = patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.gather_from_sequence_parallel_region' + ) + self.mock_gather_from_sequence_parallel_region = patcher7.start() + self.addCleanup(patcher7.stop) + self.mock_gather_from_sequence_parallel_region.return_value = torch.tensor( + [[2, 2, 2, 2], [2, 2, 2, 2]], dtype=torch.int64) + + # Mock torch.histc + patcher8 = patch('torch.histc') + self.mock_histc = patcher8.start() + self.addCleanup(patcher8.stop) + self.mock_histc.return_value = torch.tensor([2, 2, 2, 2], + dtype=torch.int64) + + # Mock torch.npu.current_device + patcher9 = patch('torch.npu.current_device') + self.mock_current_device = patcher9.start() + self.addCleanup(patcher9.stop) + self.mock_current_device.return_value = 'cpu' + + # Mock torch_npu.npu_dynamic_quant + patcher10 = patch('torch_npu.npu_dynamic_quant') + self.mock_npu_dynamic_quant = patcher10.start() + self.addCleanup(patcher10.stop) + self.mock_npu_dynamic_quant.return_value = (torch.randn(16, 16), + torch.randn(16)) + + # Mock torch_npu.npu_moe_init_routing_v2 + patcher11 = patch('torch_npu.npu_moe_init_routing_v2') + self.mock_npu_moe_init_routing_v2 = patcher11.start() + self.addCleanup(patcher11.stop) + self.mock_npu_moe_init_routing_v2.return_value = (torch.randn( + 16, 16), torch.arange(16), None, torch.randn(16)) + + # Mock torch.repeat_interleave + patcher12 = patch('torch.repeat_interleave') + self.mock_repeat_interleave = patcher12.start() + self.addCleanup(patcher12.stop) + self.mock_repeat_interleave.return_value = torch.arange(16) + + self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2, + num_experts=4, + num_local_experts=2, + with_quant=False) + 
self.row_idx = torch.arange(10, dtype=torch.int32) + + def test_token_dispatch(self): + hidden_states = torch.randn(8, 16) + topk_weights = torch.rand(8, 4) + topk_ids = torch.randint(0, 4, (8, 2)).long() + expert_map = torch.tensor([0, 1, 2, 3]) + + self.dispatcher.expert_ids_per_ep_rank = torch.tensor( + [0, 1], dtype=torch.int32) + self.dispatcher.local_expert_indices = [0, 1] + + result = self.dispatcher.token_dispatch(hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=self.row_idx, + expert_map=expert_map) + + self.assertIsNotNone(result["hidden_states"]) + self.assertIsNotNone(result["group_list"]) + self.assertEqual(result["group_list_type"], 1) + + def test_token_combine(self): + self.dispatcher.hidden_shape = (8, 16) + self.dispatcher.hidden_shape_before_permute = (8, 16) + self.dispatcher.reversed_local_input_permutation_mapping = torch.arange( + 8) + self.dispatcher.topk_weights = torch.rand(8, 4) + self.dispatcher.input_splits = [4, 4] + self.dispatcher.output_splits = [4, 4] + self.dispatcher.reversed_global_input_permutation_mapping = torch.arange( + 16) + + self.dispatcher.expert_ids_per_ep_rank = torch.tensor( + [0, 1], dtype=torch.int32) + self.dispatcher.local_expert_indices = [0, 1] + self.dispatcher.num_global_tokens_per_local_expert = torch.tensor( + [[2, 2], [2, 2]], dtype=torch.int64) + + expert_output = torch.randn(16, 16) + output = self.dispatcher.token_combine(expert_output) + + self.assertIsNotNone(output) + self.assertEqual(output.shape, (8, 16)) + + def test_token_dispatch_with_quant(self): + self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2, + num_experts=4, + num_local_experts=2) + + hidden_states = torch.randn(8, 16) + topk_weights = torch.rand(8, 4) + topk_ids = torch.randint(0, 4, (8, 2)).long() + expert_map = torch.tensor([0, 1, 2, 3]) + + self.dispatcher.expert_ids_per_ep_rank = torch.tensor( + [0, 1], dtype=torch.int32) + self.dispatcher.local_expert_indices = [0, 1] + + result = 
self.dispatcher.token_dispatch(hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=self.row_idx, + expert_map=expert_map, + with_quant=True) + + self.assertIsNotNone(result["hidden_states"]) + self.assertIsNotNone(result["group_list"]) + self.assertIsNotNone(result["dynamic_scale"]) + self.assertEqual(result["group_list_type"], 1) + + def test_token_dispatch_with_quant_no_active_tokens(self): + self.dispatcher = TokenDispatcherWithAll2AllV(top_k=2, + num_experts=4, + num_local_experts=2) + + self.mock_repeat_interleave.return_value = torch.tensor( + [], dtype=torch.long) + + hidden_states = torch.randn(8, 16) + topk_weights = torch.rand(8, 4) + topk_ids = torch.randint(0, 4, (8, 2)).long() + expert_map = torch.tensor([0, 1, 2, 3]) + + self.dispatcher.expert_ids_per_ep_rank = torch.tensor( + [0, 1], dtype=torch.int32) + self.dispatcher.local_expert_indices = [0, 1] + + result = self.dispatcher.token_dispatch(hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=self.row_idx, + expert_map=expert_map, + with_quant=True) + + self.assertIsNotNone(result["hidden_states"]) + self.assertIsNotNone(result["group_list"]) + self.assertIsNotNone(result["dynamic_scale"]) + self.assertEqual(result["group_list_type"], 1) + + def test_token_dispatch_with_log2phy(self): + hidden_states = torch.randn(8, 16) + topk_weights = torch.rand(8, 4) + topk_ids = torch.randint(0, 4, (8, 2)).long() + expert_map = torch.tensor([0, 1, 2, 3]) + log2phy = torch.tensor([1, 0, 3, 2]) + + self.dispatcher.expert_ids_per_ep_rank = torch.tensor( + [0, 1], dtype=torch.int32) + self.dispatcher.local_expert_indices = [0, 1] + + result = self.dispatcher.token_dispatch(hidden_states=hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=self.row_idx, + expert_map=expert_map, + log2phy=log2phy) + + self.assertIsNotNone(result["hidden_states"]) + self.assertIsNotNone(result["group_list"]) + 
self.assertEqual(result["group_list_type"], 1) + + +class TestDispatcherRegistry(TestBase): + + def setUp(self): + _Dispatchers.clear() + + def tearDown(self): + _Dispatchers.clear() + + def test_register_and_get_token_dispatcher(self): + mock_dispatcher = MagicMock() + mock_dispatcher.__class__.__name__ = "MockDispatcher" + + _register_token_dispatcher(mock_dispatcher) + + self.assertIn("MockDispatcher", _Dispatchers) + self.assertIs(_Dispatchers["MockDispatcher"], mock_dispatcher) + + retrieved_dispatcher = get_token_dispatcher("MockDispatcher") + self.assertIs(retrieved_dispatcher, mock_dispatcher) + + self.assertIsNone(get_token_dispatcher("NonExistentDispatcher")) + + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAllGather' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher' + ) + def test_setup_token_dispatchers_ep_size_1_creates_allgather( + self, mock_register, mock_allgather_class): + kwargs = {"top_k": 2, "num_experts": 8} + mock_instance = MagicMock() + mock_allgather_class.return_value = mock_instance + + self.assertNotIn("TokenDispatcherWithAllGather", _Dispatchers) + + setup_token_dispatchers(ep_size=1, **kwargs) + + mock_allgather_class.assert_called_once_with(**kwargs) + mock_register.assert_called_once_with(mock_instance) + + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher' + ) + def test_setup_token_dispatchers_ep_size_2_creates_all2allv( + self, mock_register, mock_all2allv_class): + kwargs = {"top_k": 2, "num_experts": 16, "num_local_experts": 2} + mock_instance = MagicMock() + mock_all2allv_class.return_value = mock_instance + + self.assertNotIn("TokenDispatcherWithAll2AllV", _Dispatchers) + + setup_token_dispatchers(ep_size=2, **kwargs) + + mock_all2allv_class.assert_called_once_with(**kwargs) + 
mock_register.assert_called_once_with(mock_instance) + + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithMC2' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher' + ) + def test_setup_token_dispatchers_ep_size_16_creates_all2allv_and_mc2( + self, mock_register, mock_mc2_class, mock_all2allv_class): + kwargs = {"top_k": 2, "num_experts": 32, "num_local_experts": 2} + mock_all2allv_instance = MagicMock() + mock_mc2_instance = MagicMock() + mock_all2allv_class.return_value = mock_all2allv_instance + mock_mc2_class.return_value = mock_mc2_instance + + self.assertNotIn("TokenDispatcherWithAll2AllV", _Dispatchers) + self.assertNotIn("TokenDispatcherWithMC2", _Dispatchers) + + setup_token_dispatchers(ep_size=16, **kwargs) + + mock_all2allv_class.assert_called_once_with(**kwargs) + mock_mc2_class.assert_called_once_with(**kwargs) + self.assertEqual(mock_register.call_count, 2) + mock_register.assert_any_call(mock_all2allv_instance) + mock_register.assert_any_call(mock_mc2_instance) + + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithMC2' + ) + @patch( + 'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher' + ) + def test_setup_token_dispatchers_ep_size_16_skips_if_exist( + self, mock_register, mock_mc2_class, mock_all2allv_class): + kwargs = {"top_k": 2, "num_experts": 32, "num_local_experts": 2} + mock_existing_all2allv = MagicMock() + mock_existing_mc2 = MagicMock() + _Dispatchers["TokenDispatcherWithAll2AllV"] = mock_existing_all2allv + _Dispatchers["TokenDispatcherWithMC2"] = mock_existing_mc2 + + setup_token_dispatchers(ep_size=16, **kwargs) + + mock_all2allv_class.assert_not_called() + mock_mc2_class.assert_not_called() + mock_register.assert_not_called() + 
self.assertIs(_Dispatchers["TokenDispatcherWithAll2AllV"], + mock_existing_all2allv) + self.assertIs(_Dispatchers["TokenDispatcherWithMC2"], + mock_existing_mc2) diff --git a/tests/ut/ops/test_vocab_parallel_embedding.py b/tests/ut/ops/test_vocab_parallel_embedding.py new file mode 100644 index 0000000..5378b19 --- /dev/null +++ b/tests/ut/ops/test_vocab_parallel_embedding.py @@ -0,0 +1,232 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm/tests/lora/test_layers.py + +import unittest +from unittest.mock import MagicMock, patch + +import torch + +from vllm_ascend.ops.vocab_parallel_embedding import ( + AscendLogitsProcessor, AscendParallelLMHead, AscendVocabParallelEmbedding) + +VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS = 128 + + +class TestCustomVocabParallelEmbedding(unittest.TestCase): + + def setUp(self): + self.num_embeddings = 50 + self.embedding_dim = 10 + self.org_num_embeddings = 40 + self.padding_size = 8 + + def _create_layer(self): + # Patch methods and dependencies for VocabParallelEmbedding + mock_group = MagicMock() + mock_group.world_size = 2 + mock_group.rank_in_group = 0 + with patch("vllm_ascend.ops.vocab_parallel_embedding.get_tp_group", return_value=mock_group), \ + patch("vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", return_value=0), \ + patch("vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", return_value=2), \ + patch("vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size", side_effect=lambda x, y: x + y), \ + patch("vllm.model_executor.layers.vocab_parallel_embedding.divide", side_effect=lambda x, y: x // y): + + # Create an instance of VocabParallelEmbedding + layer = AscendVocabParallelEmbedding( + num_embeddings=self.num_embeddings, + embedding_dim=self.embedding_dim, + org_num_embeddings=self.org_num_embeddings, + padding_size=self.padding_size, + quant_config=None, # Mock quantization config + prefix="") + + layer.shard_indices = MagicMock() + layer.shard_indices.org_vocab_start_index = 10 + layer.shard_indices.org_vocab_end_index = 20 + layer.shard_indices.num_org_vocab_padding = 5 + layer.shard_indices.added_vocab_start_index = 30 + layer.shard_indices.added_vocab_end_index = 40 + + # Mock the quantization method + layer.quant_method.embedding = MagicMock( + side_effect=lambda _, x: torch.randn(x.shape[0], self. 
+ embedding_dim)) + return layer + + def test_get_masked_input_and_mask(self): + """Test the mask and offset calculation helper function.""" + layer = self._create_layer() + + input_ = torch.tensor([5, 15, 25, 35, 45]) + + masked_input, mask = layer._get_masked_input_and_mask( + input_, + org_vocab_start_index=10, + org_vocab_end_index=20, + num_org_vocab_padding=5, + added_vocab_start_index=30, + added_vocab_end_index=40) + + expected_mask = torch.tensor([True, False, True, False, True]) + self.assertTrue( + torch.equal(mask, expected_mask), + f"Mask mismatch. Expected {expected_mask}, got {mask}") + + expected_masked = torch.tensor([0, 5, 0, 20, 0]) + self.assertTrue( + torch.equal(masked_input, expected_masked), + f"Masked input mismatch. Expected {expected_masked}, got {masked_input}" + ) + + def test_forward_with_tp_size_1(self): + """Test forward pass without tensor parallelism.""" + # Create a fresh mock embedding with tp_size=1 + layer = self._create_layer() + layer.tp_size = 1 + layer.quant_method.embedding = MagicMock( + return_value=torch.randn(3, layer.embedding_dim)) + + input_ = torch.tensor([1, 2, 3]) + + with patch( + "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", + side_effect=lambda x: x) as mock_reduce_tp1: + output = layer.forward(input_) + + # Should just pass through without masking + layer.quant_method.embedding.assert_called_once_with( + layer, input_.long()) + self.assertEqual(output.shape, (3, layer.embedding_dim)) + + # Verify all_reduce was called once + mock_reduce_tp1.assert_called_once() + + def test_forward_with_tp(self): + layer = self._create_layer() + layer.tp_size = 2 + + input_ = torch.tensor([15, 35]) # one org vocab, one added vocab + + with patch( + "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", + side_effect=lambda x: x) as mock_reduce_tp: + # Call the forward method + output = layer.forward(input_) + + # Check that masking was applied correctly + 
layer.quant_method.embedding.assert_called_once() + called_input = layer.quant_method.embedding.call_args[0][1] + expected_input = torch.tensor([5, 20]) # after offset calculation + self.assertTrue(torch.all(called_input == expected_input)) + + # Check that all reduce was called + mock_reduce_tp.assert_called_once() + self.assertEqual(output.shape, (2, self.embedding_dim)) + + def test_forward_with_invalid_vocab(self): + """Test that invalid vocab indices are properly masked out.""" + # Create a fresh embedding layer + layer = self._create_layer() + input_ = torch.tensor([5, 15, 25, 35, 45]) # includes invalid cases + # Create predictable mock output + mock_output = torch.randn(5, self.embedding_dim) + layer.quant_method.embedding = MagicMock( + return_value=mock_output.clone()) + + # Patch tensor_model_parallel_all_reduce to mock its behavior + with patch( + "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", + side_effect=lambda x: x): + # Call the forward method + output = layer.forward(input_) + # Check that invalid positions (0, 2, 4) were zeroed out + self.assertTrue(torch.all(output[0] == 0)) + self.assertTrue(torch.all(output[2] == 0)) + self.assertTrue(torch.all(output[4] == 0)) + self.assertTrue(torch.all(output[1] == mock_output[1])) + self.assertTrue(torch.all(output[3] == mock_output[3])) + self.assertEqual(output.shape, (5, self.embedding_dim)) + + def test_output_shape(self): + """Test that output shape is correct.""" + # Create a fresh embedding layer + layer = self._create_layer() + + test_cases = [ + (torch.tensor([15]), (1, self.embedding_dim)), + (torch.tensor([15, 35]), (2, self.embedding_dim)), + (torch.tensor([15, 35, 16, 36]), (4, self.embedding_dim)), + ] + + for input_, expected_shape in test_cases: + with self.subTest(input=input_): + with patch( + "vllm_ascend.ops.vocab_parallel_embedding.tensor_model_parallel_all_reduce", + side_effect=lambda x: x): + # Call the forward method + output = layer.forward(input_) + 
self.assertEqual(output.shape, expected_shape) + + +class TestAscendLogitsProcessor(unittest.TestCase): + + def setUp(self): + self.vocab_size = 50 + self.num_embeddings = 50 + self.embedding_dim = 10 + self.org_num_embeddings = 40 + self.padding_size = 8 + + self.mock_group = MagicMock() + self.mock_group.world_size = 2 + self.mock_group.rank_in_group = 0 + self.mock_ascend_config = MagicMock() + self.mock_quant_method = MagicMock() + self.mock_quant_method.apply = MagicMock( + return_value=torch.randn(1, self.vocab_size)) + self.patches = [ + patch("vllm_ascend.ascend_config.get_ascend_config", + return_value=self.mock_ascend_config), + patch( + "vllm_ascend.ops.vocab_parallel_embedding.get_lmhead_tp_group", + return_value=self.mock_group), + patch("vllm_ascend.ops.vocab_parallel_embedding.lmhead_tp_enable", + return_value=True), + patch( + "vllm_ascend.ops.vocab_parallel_embedding.get_lmhead_tp_group.all_to_all", + return_value=torch.randn(1, self.vocab_size)) + ] + + for p in self.patches: + p.start() + + def tearDown(self): + for p in self.patches: + p.stop() + + def test_create_processor(self): + processor = AscendLogitsProcessor(vocab_size=self.vocab_size) + self.assertEqual(processor.vocab_size, self.vocab_size) + + def test_get_logits(self): + processor = AscendLogitsProcessor(vocab_size=self.vocab_size) + lmhead = AscendParallelLMHead(num_embeddings=self.num_embeddings, + embedding_dim=self.embedding_dim, + prefix="lm_head") + lmhead.quant_method = self.mock_quant_method + lmhead.quant_method.apply = self.mock_quant_method.apply + hidden_state = torch.randn(1, self.org_num_embeddings) + processor._get_logits(hidden_state, lmhead) + self.mock_quant_method.apply.assert_called_once() diff --git a/tests/ut/patch/worker/patch_common/test_patch_distributed.py b/tests/ut/patch/worker/patch_common/test_patch_distributed.py new file mode 100644 index 0000000..4975313 --- /dev/null +++ b/tests/ut/patch/worker/patch_common/test_patch_distributed.py @@ -0,0 +1,112 @@ 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from unittest.mock import MagicMock, patch + +import torch +from vllm.distributed.parallel_state import GroupCoordinator + +from tests.ut.base import TestBase +from vllm_ascend.patch.worker.patch_common.patch_distributed import \ + GroupCoordinatorPatch + + +class TestPatchDistributed(TestBase): + + def setUp(self): + self.mock_group_ranks = [[0, 1]] + self.mock_local_rank = 0 + self.mock_backend = "hccl" + self.mock_use_device_comm = True + + patcher_get_rank = patch("torch.distributed.get_rank", return_value=0) + patcher_new_group = patch("torch.distributed.new_group", + return_value=MagicMock()) + patcher_is_cuda_alike = patch( + "vllm.platforms.current_platform.is_cuda_alike", return_value=True) + patcher_device_comm_cls = patch( + "vllm.distributed.parallel_state.resolve_obj_by_qualname", + return_value=MagicMock()) + + self.mock_get_rank = patcher_get_rank.start() + self.mock_new_group = patcher_new_group.start() + self.mock_is_cuda_alike = patcher_is_cuda_alike.start() + self.mock_resolve_obj = patcher_device_comm_cls.start() + + self.addCleanup(patcher_get_rank.stop) + self.addCleanup(patcher_new_group.stop) + self.addCleanup(patcher_is_cuda_alike.stop) + self.addCleanup(patcher_device_comm_cls.stop) + + self.group_coordinator = GroupCoordinatorPatch( + group_ranks=self.mock_group_ranks, + local_rank=self.mock_local_rank, + 
torch_distributed_backend=self.mock_backend, + use_device_communicator=self.mock_use_device_comm) + + def test_GroupCoordinator_patched(self): + self.assertIs(GroupCoordinator, GroupCoordinatorPatch) + + def test_all_to_all_returns_input_when_world_size_1(self): + self.group_coordinator.world_size = 1 + input_tensor = torch.randn(2, 3) + output = self.group_coordinator.all_to_all(input_tensor) + self.assertTrue(torch.equal(output, input_tensor)) + + def test_all_to_all_raises_assertion_on_invalid_scatter_dim(self): + input_tensor = torch.randn(2, 3) + with self.assertRaises(AssertionError) as cm: + self.group_coordinator.all_to_all(input_tensor, scatter_dim=2) + self.assertIn("Invalid scatter dim", str(cm.exception)) + + def test_all_to_all_raises_assertion_on_invalid_gather_dim(self): + input_tensor = torch.randn(2, 3) + with self.assertRaises(AssertionError) as cm: + self.group_coordinator.all_to_all(input_tensor, gather_dim=2) + self.assertIn("Invalid gather dim", str(cm.exception)) + + def test_all_to_all_calls_device_communicator_with_correct_args(self): + mock_communicator = MagicMock() + self.group_coordinator.device_communicator = mock_communicator + + input_tensor = torch.randn(2, 3) + scatter_dim = 0 + gather_dim = 1 + scatter_sizes = [1, 1] + gather_sizes = [1, 1] + + self.group_coordinator.all_to_all(input_tensor, + scatter_dim=scatter_dim, + gather_dim=gather_dim, + scatter_sizes=scatter_sizes, + gather_sizes=gather_sizes) + + mock_communicator.all_to_all.assert_called_once_with( + input_tensor, scatter_dim, gather_dim, scatter_sizes, gather_sizes) + + def test_all_to_all_calls_device_communicator_without_sizes(self): + mock_communicator = MagicMock() + self.group_coordinator.device_communicator = mock_communicator + + input_tensor = torch.randn(2, 3) + scatter_dim = 0 + gather_dim = 1 + + self.group_coordinator.all_to_all(input_tensor, + scatter_dim=scatter_dim, + gather_dim=gather_dim) + + mock_communicator.all_to_all.assert_called_once_with( + 
input_tensor, scatter_dim, gather_dim, None, None) diff --git a/tests/ut/patch/worker/patch_common/test_patch_linear.py b/tests/ut/patch/worker/patch_common/test_patch_linear.py new file mode 100644 index 0000000..b7fbbc4 --- /dev/null +++ b/tests/ut/patch/worker/patch_common/test_patch_linear.py @@ -0,0 +1,167 @@ +from importlib import reload + +import pytest +import torch +import vllm +from pytest_mock import MockerFixture + +import vllm_ascend.envs as envs_ascend +from tests.ut.base import PytestBase +from vllm_ascend.patch.worker.patch_common import patch_linear + + +class TestAscendRowParallelLinear(PytestBase): + + def init_row_parallel_linear(self, mocker: MockerFixture): + mocker.patch( + "vllm_ascend.patch.worker.patch_common.patch_linear.AscendRowParallelLinear.__init__", + return_value=None, + ) + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + return patch_linear.AscendRowParallelLinear( + input_size=128, + output_size=256, + ) + + @pytest.mark.parametrize( + "version, expected", + [ + ("1.0.0", 1), + ("2.1.0", 1), + ], + ) + def test_get_hcomm_info(self, version, expected, mocker: MockerFixture): + mock_group = mocker.MagicMock() + backend = mocker.MagicMock() + backend.get_hccl_comm_name = lambda x: x + mock_group._get_backend = lambda x: backend + mock_group.get_hccl_comm_name = lambda x: x + mocker.patch("torch.distributed.get_rank", return_value=1) + mocker.patch( + "torch.distributed.get_global_rank", + return_value=0, + ) + mocker.patch("torch.__version__", new=version) + hcomm_info = patch_linear.AscendRowParallelLinear.get_hcomm_info( + mock_group) + assert hcomm_info == expected + + @pytest.mark.parametrize( + "skip_bias_add, return_bias, bias, expected", + [ + (True, False, torch.tensor(1.0), torch.tensor(14.0)), + (False, True, torch.tensor(1.0), (torch.tensor(14.0), None)), + ( + True, + True, + torch.tensor(1.0), + (torch.tensor(14.0), 
torch.tensor(1.0)), + ), + ], + ) + def test_forward( + self, + skip_bias_add, + return_bias, + bias, + expected, + mocker: MockerFixture, + ): + mocker_tp_group = mocker.MagicMock() + mocker_tp_group.device_group = mocker.MagicMock() + row_parallel_linear = self.init_row_parallel_linear(mocker) + row_parallel_linear.__dict__["tp_rank"] = 0 + row_parallel_linear.__dict__["skip_bias_add"] = skip_bias_add + row_parallel_linear.__dict__["return_bias"] = return_bias + row_parallel_linear.__dict__["bias"] = bias + row_parallel_linear.__dict__["qyuant_method"] = mocker.MagicMock() + row_parallel_linear.__dict__["calc_input"] = lambda x: x # noqa + row_parallel_linear.__dict__[ + "calc_output"] = lambda x: x.matmul( # noqa + torch.tensor([1.0, 2.0])) + ret = row_parallel_linear.forward(torch.tensor([10.0, 2.0])) + if isinstance(ret, tuple): + assert torch.allclose(ret[0], expected[0]) + if ret[1] is None: + assert ret[1] == expected[1] + else: + assert torch.allclose(ret[1], expected[1]) + else: + assert torch.allclose(ret, expected) + + @pytest.mark.parametrize( + "input_is_parallel, expected", + [ + (True, torch.tensor([10.0, 2.0])), + (False, torch.tensor([10.0])), + ], + ) + def test_calc_input( + self, + input_is_parallel, + expected, + mocker: MockerFixture, + ): + row_parallel_linear = self.init_row_parallel_linear(mocker) + row_parallel_linear.__dict__["input_is_parallel"] = input_is_parallel + input_tensor = torch.Tensor([10, 2]) + mocker.patch( + "vllm_ascend.patch.worker.patch_common.patch_linear.get_tensor_model_parallel_rank", # noqa + return_value=0, + ) + mocker.patch( + "vllm_ascend.patch.worker.patch_common.patch_linear.split_tensor_along_last_dim", # noqa + return_value=[torch.Tensor([10]), + torch.Tensor([2])], + ) + input_parallel = row_parallel_linear.calc_input(input_tensor) + assert torch.allclose(input_parallel, expected) + + @pytest.mark.parametrize( + "reduce_results, tp_size, expected", + [ + (True, 2, torch.tensor(56.0)), + (True, 1, 
torch.tensor(14.0)), + (False, 2, torch.tensor(14.0)), + ], + ) + def test_calc_output( + self, + reduce_results, + tp_size, + expected, + mocker: MockerFixture, + ): + quant_method = mocker.MagicMock() + quant_method.apply = lambda self, x, bias=None: x.matmul( # noqa + torch.tensor([1.0, 2.0])) + row_parallel_linear = self.init_row_parallel_linear(mocker) + row_parallel_linear.__dict__["reduce_results"] = reduce_results + row_parallel_linear.__dict__["tp_size"] = tp_size + row_parallel_linear.__dict__["quant_method"] = quant_method + row_parallel_linear.__dict__["tp_rank"] = 0 + row_parallel_linear.__dict__["get_hcomm_info"] = lambda x: None # noqa + + mocker.patch( + "vllm_ascend.patch.worker.patch_common.patch_linear.get_tp_group", + return_value=mocker.MagicMock(device_group=mocker.MagicMock()), + ) + mocker.patch( + "torch_npu.npu_mm_all_reduce_base", + side_effect=lambda input_, weight, hccl_info, bias: input_. + matmul( # noqa + torch.tensor([4.0, 8.0])), + ) # noqa + ret = row_parallel_linear.calc_output(torch.tensor([10.0, 2.0])) + assert torch.allclose(ret, expected) + + def test_enable_allreduce_matmul(self, mocker: MockerFixture): + mocker.patch.object(envs_ascend, + "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", + new=True) + reload(patch_linear) + assert envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE + assert id(vllm.model_executor.layers.linear.RowParallelLinear) == id( + patch_linear.AscendRowParallelLinear) diff --git a/tests/ut/patch/worker/patch_common/test_patch_minicpm.py b/tests/ut/patch/worker/patch_common/test_patch_minicpm.py new file mode 100644 index 0000000..47d1957 --- /dev/null +++ b/tests/ut/patch/worker/patch_common/test_patch_minicpm.py @@ -0,0 +1,77 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from unittest.mock import MagicMock + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.patch.worker.patch_common.patch_minicpm import forward + + +class TestPatchMiniCPM(TestBase): + + def setUp(self): + self.mock_self = MagicMock() + + self.mock_self.q_size = 128 + self.mock_self.kv_size = 128 + + self.mock_self.qkv_proj = MagicMock() + self.mock_self.rotary_emb = MagicMock() + self.mock_self.attn = MagicMock() + self.mock_self.o_proj = MagicMock() + + self.positions = torch.tensor([1, 2, 3]) + self.hidden_states = torch.randn(3, 256) + + self.mock_qkv = torch.randn(3, 384) + self.mock_q = self.mock_qkv[:, :128] + self.mock_k = self.mock_qkv[:, 128:256] + self.mock_v = self.mock_qkv[:, 256:] + + self.mock_self.qkv_proj.return_value = (self.mock_qkv, None) + self.mock_self.rotary_emb.return_value = (self.mock_q, self.mock_k) + self.mock_self.attn.return_value = torch.randn(3, 256) + self.mock_self.o_proj.return_value = (torch.randn(3, 256), None) + + def test_forward_patched(self): + from vllm.model_executor.models.minicpm import MiniCPMAttention + + self.assertIs(MiniCPMAttention.forward, forward) + + def test_forward_function(self): + result = forward(self.mock_self, self.positions, self.hidden_states) + + self.mock_self.qkv_proj.assert_called_once_with(self.hidden_states) + + args, _ = self.mock_self.rotary_emb.call_args + self.assertEqual(len(args), 3) + self.assertTrue(torch.equal(args[0], self.positions)) + self.assertTrue(torch.equal(args[1], self.mock_q)) + 
self.assertTrue(torch.equal(args[2], self.mock_k)) + + args, _ = self.mock_self.attn.call_args + self.assertEqual(len(args), 3) + self.assertTrue(torch.equal(args[0], self.mock_q)) + self.assertTrue(torch.equal(args[1], self.mock_k)) + self.assertTrue(torch.equal(args[2], self.mock_v)) + + self.mock_self.o_proj.assert_called_once_with( + self.mock_self.attn.return_value) + + self.assertEqual(result.shape, (3, 256)) + self.assertTrue( + torch.equal(result, self.mock_self.o_proj.return_value[0])) diff --git a/tests/ut/quantization/test_func_wrapper.py b/tests/ut/quantization/test_func_wrapper.py new file mode 100644 index 0000000..5020f80 --- /dev/null +++ b/tests/ut/quantization/test_func_wrapper.py @@ -0,0 +1,134 @@ +from unittest.mock import patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.quantization.func_wrapper import (wrapper_rmsnorm_forward_oot, + wrapper_rmsnorm_init) + + +class MockRMSNorm: + + def __init__(self, hidden_size: int, **extra_args): + self.hidden_size = hidden_size + self.weight = torch.ones(hidden_size) + self.input_scale = 1.0 + self.input_offset = 0.0 + self.variance_epsilon = 1e-6 + self.bias = torch.nn.Parameter(torch.zeros(hidden_size), + requires_grad=False) + self.ignore_anti = extra_args.get('ignore_anti', True) + + +class TestFuncWrapper(TestBase): + + def test_wrapper_rmsnorm_init(self): + + @wrapper_rmsnorm_init + def init(self, hidden_size: int, **extra_args) -> None: + self.hidden_size = hidden_size + + hidden_size = 128 + extra_args = {'arg1': 'value1'} + + rms_norm = MockRMSNorm(hidden_size, **extra_args) + init(rms_norm, hidden_size, **extra_args) + + self.assertTrue(hasattr(rms_norm, 'ignore_anti')) + self.assertTrue(rms_norm.ignore_anti) + + self.assertTrue(hasattr(rms_norm, 'bias')) + self.assertIsInstance(rms_norm.bias, torch.nn.Parameter) + self.assertEqual(rms_norm.bias.shape, torch.Size([hidden_size])) + self.assertFalse(rms_norm.bias.requires_grad) + + 
@patch('torch_npu._npu_quant_rms_norm') + def test_wrapper_rmsnorm_forward_oot_with_residual( + self, mock_npu_quant_rms_norm): + hidden_size = 128 + x = torch.randn(hidden_size) + residual = torch.randn(hidden_size) + expected_out = torch.randn(hidden_size) + + mock_npu_quant_rms_norm.return_value = (expected_out, residual) + + @wrapper_rmsnorm_forward_oot + def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None): + return x, residual + + rms_norm = MockRMSNorm(hidden_size) + rms_norm.ignore_anti = False + + output, res = forward_oot(rms_norm, x, residual) + + mock_npu_quant_rms_norm.assert_called_once() + + args, kwargs = mock_npu_quant_rms_norm.call_args + self.assertTrue(torch.equal(args[1], rms_norm.weight)) + self.assertTrue(torch.equal(args[2], rms_norm.bias)) + self.assertEqual(args[3], rms_norm.input_scale) + self.assertEqual(args[4], rms_norm.input_offset) + self.assertEqual(args[5], rms_norm.variance_epsilon) + self.assertTrue(torch.equal(res, residual)) + + @patch('torch_npu._npu_quant_rms_norm') + def test_wrapper_rmsnorm_forward_oot_without_residual( + self, mock_npu_quant_rms_norm): + hidden_size = 128 + x = torch.randn(hidden_size) + expected_out = torch.randn(hidden_size) + + mock_npu_quant_rms_norm.return_value = expected_out + + @wrapper_rmsnorm_forward_oot + def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None): + return x + + rms_norm = MockRMSNorm(hidden_size) + rms_norm.ignore_anti = False + + output = forward_oot(rms_norm, x) + + mock_npu_quant_rms_norm.assert_called_once() + + args, kwargs = mock_npu_quant_rms_norm.call_args + self.assertTrue(torch.equal(args[0], x)) + self.assertTrue(torch.equal(args[1], rms_norm.weight)) + self.assertTrue(torch.equal(args[2], rms_norm.bias)) + self.assertEqual(args[3], rms_norm.input_scale) + self.assertEqual(args[4], rms_norm.input_offset) + self.assertEqual(args[5], rms_norm.variance_epsilon) + + self.assertTrue(torch.equal(output, expected_out)) + + def 
test_wrapper_rmsnorm_forward_oot_ignore_anti_with_residual(self): + hidden_size = 128 + x = torch.randn(hidden_size) + residual = torch.randn(hidden_size) + + @wrapper_rmsnorm_forward_oot + def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None): + return x, residual + + rms_norm = MockRMSNorm(hidden_size) + rms_norm.ignore_anti = True + + output, res = forward_oot(rms_norm, x, residual) + + self.assertTrue(torch.equal(output, x.add_(rms_norm.bias))) + self.assertTrue(torch.equal(res, residual)) + + def test_wrapper_rmsnorm_forward_oot_ignore_anti_no_residual(self): + hidden_size = 128 + x = torch.randn(hidden_size) + + @wrapper_rmsnorm_forward_oot + def forward_oot(self, x: torch.Tensor, residual: torch.Tensor = None): + return x + + rms_norm = MockRMSNorm(hidden_size) + rms_norm.ignore_anti = True + + output = forward_oot(rms_norm, x) + + self.assertTrue(torch.equal(output, x.add_(rms_norm.bias))) diff --git a/tests/ut/quantization/test_quant_config.py b/tests/ut/quantization/test_quant_config.py new file mode 100644 index 0000000..7529fea --- /dev/null +++ b/tests/ut/quantization/test_quant_config.py @@ -0,0 +1,232 @@ +from unittest.mock import MagicMock, patch + +import torch +from vllm.attention.layer import Attention +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) + +from tests.ut.base import TestBase +from vllm_ascend.quantization.quant_config import (AscendKVCacheMethod, + AscendQuantConfig) +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD + + +class TestAscendQuantConfig(TestBase): + + def setUp(self): + self.sample_config = { + "weight": "INT8", + "fa_quant_type": "C8", + "kv_quant_type": "C8", + "layer1.weight": "INT8", + "layer2.weight": "FLOAT", + "fused_layer.weight": "FLOAT", + "fused_layer.shard1.weight": "FLOAT", + "fused_layer.shard2.weight": "FLOAT", + 
"shard1.weight": "FLOAT", + "shard2.weight": "FLOAT", + } + self.ascend_config = AscendQuantConfig(self.sample_config) + self.ascend_config.packed_modules_mapping = None + + def test_init(self): + self.assertEqual(self.ascend_config.quant_description, + self.sample_config) + + def test_repr(self): + repr_str = repr(self.ascend_config) + self.assertTrue(repr_str.startswith("AscendQuantConfig:\n")) + + def test_get_name(self): + self.assertEqual(AscendQuantConfig.get_name(), + ASCEND_QUANTIZATION_METHOD) + + def test_get_supported_act_dtypes(self): + supported_dtypes = AscendQuantConfig.get_supported_act_dtypes() + self.assertEqual(len(supported_dtypes), 3) + + def test_get_min_capability(self): + with self.assertRaises(NotImplementedError): + AscendQuantConfig.get_min_capability() + + def test_get_config_filenames(self): + filenames = AscendQuantConfig.get_config_filenames() + self.assertEqual(filenames, ["quant_model_description.json"]) + + def test_from_config(self): + config = AscendQuantConfig.from_config(self.sample_config) + self.assertIsInstance(config, AscendQuantConfig) + self.assertEqual(config.quant_description, self.sample_config) + + @patch('torch.npu.is_available') + def test_override_quantization_method(self, mock_is_available): + # Test when NPU is available + mock_is_available.return_value = True + result = AscendQuantConfig.override_quantization_method(None, None) + self.assertEqual(result, ASCEND_QUANTIZATION_METHOD) + + # Test when NPU is not available + mock_is_available.return_value = False + result = AscendQuantConfig.override_quantization_method(None, None) + self.assertIsNone(result) + + def test_get_quant_method_for_linear(self): + linear_layer = MagicMock(spec=LinearBase) + # Test skipped layer + with patch.object(self.ascend_config, + 'is_layer_skipped_ascend', + return_value=True): + method = self.ascend_config.get_quant_method(linear_layer, ".attn") + self.assertIsInstance(method, UnquantizedLinearMethod) + + # Test quantized layer + 
with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \ + patch('vllm_ascend.quantization.quant_config.AscendLinearMethod', return_value=MagicMock()) as mock_ascend_linear: + + method = self.ascend_config.get_quant_method(linear_layer, ".attn") + self.assertIs(method, mock_ascend_linear.return_value) + mock_ascend_linear.assert_called_once_with( + self.ascend_config, ".attn", + self.ascend_config.packed_modules_mapping) + + def test_get_quant_method_for_attention(self): + attention_layer = MagicMock(spec=Attention) + with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', + return_value=MagicMock()) as mock_ascend_kvcache: + # Test with fa_quant_type + method = self.ascend_config.get_quant_method( + attention_layer, ".attn") + self.assertIs(method, mock_ascend_kvcache.return_value) + + with patch('vllm_ascend.quantization.quant_config.AscendKVCacheMethod', + return_value=MagicMock()) as mock_ascend_kvcache: + # Test with kv_quant_type + modified_config = {"kv_quant_type": "C8"} + config = AscendQuantConfig(modified_config) + config.packed_modules_mapping = None + method = config.get_quant_method(attention_layer, "attn") + self.assertIs(method, mock_ascend_kvcache.return_value) + + def test_get_quant_method_for_fused_moe(self): + fused_moe_layer = MagicMock(spec=FusedMoE) + fused_moe_layer.moe = MagicMock(spec=FusedMoEConfig) + fused_moe_layer.moe_config = MagicMock(spec=FusedMoEConfig) + + # Test skipped layer + with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=True), \ + patch('vllm_ascend.quantization.quant_config.AscendUnquantizedFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe: + method = self.ascend_config.get_quant_method( + fused_moe_layer, "moe_layer") + self.assertIs(method, mock_ascend_moe.return_value) + + # Test quantized layer + with patch.object(self.ascend_config, 'is_layer_skipped_ascend', return_value=False), \ + 
patch('vllm_ascend.quantization.quant_config.AscendFusedMoEMethod', return_value=MagicMock()) as mock_ascend_moe: + method = self.ascend_config.get_quant_method( + fused_moe_layer, "moe_layer") + self.assertIs(method, mock_ascend_moe.return_value) + + def test_is_layer_skipped_ascend(self): + # Test non-fused layer that should be quantized + self.assertFalse(self.ascend_config.is_layer_skipped_ascend("layer1")) + + # Test non-fused layer that should be skipped + self.assertTrue(self.ascend_config.is_layer_skipped_ascend("layer2")) + + # Test fused layer + fused_mapping = {"fused_layer": ["shard1", "shard2"]} + self.assertTrue( + self.ascend_config.is_layer_skipped_ascend("fused_layer", + fused_mapping)) + + # Test inconsistent fused layer shards + bad_config = {"shard1.weight": "FLOAT", "shard2.weight": "INT8"} + config = AscendQuantConfig(bad_config) + with self.assertRaises(ValueError): + config.is_layer_skipped_ascend("fused_layer", fused_mapping) + + def test_get_scaled_act_names(self): + self.assertEqual(self.ascend_config.get_scaled_act_names(), []) + + +class TestAscendKVCacheMethod(TestBase): + + def setUp(self): + # Setup common test fixtures + self.mock_quant_config = MagicMock(spec=AscendQuantConfig) + self.mock_quant_config.quant_description = {"some_config": "value"} + self.prefix = "attention_layer" + + # Mock the quantizer and quant_method + self.mock_quantizer = MagicMock() + self.mock_quant_method = MagicMock() + + # Patch the AscendQuantizer + self.quantizer_patcher = patch( + 'vllm_ascend.quantization.quant_config.AscendQuantizer.get_quantizer', + return_value=self.mock_quantizer) + self.mock_get_quantizer = self.quantizer_patcher.start() + + self.mock_quantizer.build_attention_method.return_value = self.mock_quant_method + + # Create instance + self.kv_cache_method = AscendKVCacheMethod(self.mock_quant_config, + self.prefix) + + def tearDown(self): + self.quantizer_patcher.stop() + + def test_init(self): + """Test initialization with proper 
quantizer setup.""" + self.mock_get_quantizer.assert_called_once_with( + self.mock_quant_config.quant_description, self.prefix) + self.mock_quantizer.build_attention_method.assert_called_once() + + def test_create_weights(self): + """Test create_weights delegates to quant_method.""" + mock_layer = MagicMock() + self.kv_cache_method.create_weights(mock_layer) + self.mock_quant_method.create_weights.assert_called_once_with( + mock_layer) + + def test_process_weights_after_loading_with_method(self): + """Test process_weights when quant_method has the method.""" + mock_layer = MagicMock() + self.kv_cache_method.process_weights_after_loading(mock_layer) + self.mock_quant_method.process_weights_after_loading.assert_called_once_with( + mock_layer) + + def test_process_weights_after_loading_without_method(self): + """Test process_weights when quant_method lacks the method.""" + # Reset mock to remove the method + del self.mock_quant_method.process_weights_after_loading + mock_layer = MagicMock() + + # Should not raise exception + self.kv_cache_method.process_weights_after_loading(mock_layer) + + def test_apply_delegation(self): + """Test apply properly delegates to quant_method.""" + mock_layer = MagicMock() + mock_query = torch.randn(1, 32, 128) + mock_key = torch.randn(1, 32, 128) + mock_value = torch.randn(1, 32, 128) + mock_kv_cache = MagicMock() + mock_attn_metadata = MagicMock() + mock_scale = 1.0 + mock_output = torch.zeros(1, 32, 128) + mock_attn_type = MagicMock() + expected_result = torch.randn(1, 32, 128) + self.mock_quant_method.apply.return_value = expected_result + + result = self.kv_cache_method.apply(mock_layer, mock_query, mock_key, + mock_value, mock_kv_cache, + mock_attn_metadata, mock_attn_type, + mock_scale, mock_output) + + self.mock_quant_method.apply.assert_called_once_with( + mock_layer, mock_query, mock_key, mock_value, mock_kv_cache, + mock_attn_metadata, mock_attn_type, mock_scale, mock_output) + self.assertTrue(torch.equal(result, 
expected_result)) diff --git a/tests/ut/quantization/test_quantizer.py b/tests/ut/quantization/test_quantizer.py new file mode 100644 index 0000000..a51faee --- /dev/null +++ b/tests/ut/quantization/test_quantizer.py @@ -0,0 +1,145 @@ +from unittest.mock import MagicMock, patch + +from tests.ut.base import TestBase +from vllm_ascend.quantization.quant_config import AscendQuantConfig +from vllm_ascend.quantization.quantizer import (VLLMAscendQuantizer, + W4A8DYNAMICQuantizer, + W8A8Quantizer) + +SUPPORT_ASCEND_QUANTIZER_TYPE = {"test": "1"} + + +class TestGetQuantizer(TestBase): + + def setUp(self): + # Setup common test fixtures + self.supported_types = { + 'INT8': MagicMock(_instance=None), + 'FP16': MagicMock(_instance=None), + 'C8': MagicMock(_instance=None) + } + self.original_supported_types = SUPPORT_ASCEND_QUANTIZER_TYPE.copy() + SUPPORT_ASCEND_QUANTIZER_TYPE.update(self.supported_types) + self.mock_quant_config = MagicMock(spec=AscendQuantConfig) + self.mock_quant_config.quant_description = {"some_config": "value"} + + def tearDown(self): + # Restore original supported types + SUPPORT_ASCEND_QUANTIZER_TYPE.clear() + SUPPORT_ASCEND_QUANTIZER_TYPE.update(self.original_supported_types) + + def test_get_quantizer_fa(self): + """Test successful quantizer retrieval for different cases.""" + # Setup + quant_description = {'fa_quant_type': 'C8'} + prefix = '.attn' + expected_type = 'C8' + with patch.dict( + 'vllm_ascend.quantization.quantizer.SUPPORT_ASCEND_QUANTIZER_TYPE', + SUPPORT_ASCEND_QUANTIZER_TYPE): + + result = VLLMAscendQuantizer.get_quantizer( + quant_description, + prefix, + packed_modules_mapping={"some": "mapping"}) + + # Verify + self.assertIsNotNone(result) + self.assertEqual(result, + self.supported_types[expected_type]._instance) + self.supported_types[expected_type].assert_called_once_with( + quant_description) + + def test_get_quantizer_kv(self): + """Test successful quantizer retrieval for different cases.""" + # Setup + quant_description = 
{'kv_quant_type': 'C8'} + prefix = '.attn' + expected_type = 'C8' + with patch.dict( + 'vllm_ascend.quantization.quantizer.SUPPORT_ASCEND_QUANTIZER_TYPE', + SUPPORT_ASCEND_QUANTIZER_TYPE): + + result = VLLMAscendQuantizer.get_quantizer( + quant_description, + prefix, + packed_modules_mapping={"some": "mapping"}) + + # Verify + self.assertIsNotNone(result) + self.assertEqual(result, + self.supported_types[expected_type]._instance) + self.supported_types[expected_type].assert_called_once_with( + quant_description) + + def test_get_quantizer_linear(self): + """Test successful quantizer retrieval for different cases.""" + # Setup + quant_description = {'linear_type': 'INT8'} + prefix = 'nothing' + expected_type = 'INT8' + with patch('vllm_ascend.quantization.quantizer.VLLMAscendQuantizer.get_linear_quant_type', + return_value=expected_type), \ + patch.dict('vllm_ascend.quantization.quantizer.SUPPORT_ASCEND_QUANTIZER_TYPE', SUPPORT_ASCEND_QUANTIZER_TYPE): + + result = VLLMAscendQuantizer.get_quantizer( + quant_description, + prefix, + packed_modules_mapping={"some": "mapping"}) + + # Verify + self.assertIsNotNone(result) + self.assertEqual(result, + self.supported_types[expected_type]._instance) + self.supported_types[expected_type].assert_called_once_with( + quant_description) + + +class TestW8A8Quantizer(TestBase): + + def setUp(self): + self.quantizer = W8A8Quantizer(quant_description={}) + + def test_build_linear_method(self): + with patch('vllm_ascend.quantization.quantizer.AscendW8A8LinearMethod', + return_value=MagicMock()) as mock_linear: + result = self.quantizer.build_linear_method() + mock_linear.assert_called_once_with() + self.assertIsInstance(result, MagicMock) + + def test_build_moe_method(self): + with patch( + 'vllm_ascend.quantization.quantizer.AscendW8A8FusedMoEMethod', + return_value=MagicMock()) as mock_linear: + result = self.quantizer.build_moe_method() + mock_linear.assert_called_once_with() + self.assertIsInstance(result, MagicMock) + + def 
test_build_attention_method(self): + with patch('vllm_ascend.quantization.quantizer.AscendC8KVCacheMethod', + return_value=MagicMock()) as mock_linear: + result = self.quantizer.build_attention_method() + mock_linear.assert_called_once_with() + self.assertIsInstance(result, MagicMock) + + +class TestW4A8DYNAMICQuantizer(TestBase): + + def setUp(self): + self.quantizer = W4A8DYNAMICQuantizer(quant_description={}) + + def test_build_linear_method(self): + with patch( + 'vllm_ascend.quantization.quantizer.AscendW4A8DynamicLinearMethod', + return_value=MagicMock()) as mock_linear: + result = self.quantizer.build_linear_method() + mock_linear.assert_called_once_with() + self.assertIsInstance(result, MagicMock) + + def test_build_moe_method(self): + with patch( + 'vllm_ascend.quantization.quantizer.AscendW4A8DynamicFusedMoEMethod', + return_value=MagicMock()) as mock_fused_moe: + result = self.quantizer.build_moe_method() + mock_fused_moe.assert_called_once_with() + self.assertIsInstance(result, MagicMock) diff --git a/tests/ut/quantization/test_w4a8_dynamic.py b/tests/ut/quantization/test_w4a8_dynamic.py new file mode 100644 index 0000000..d7fdf82 --- /dev/null +++ b/tests/ut/quantization/test_w4a8_dynamic.py @@ -0,0 +1,166 @@ +import copy +from unittest.mock import Mock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.quantization.w4a8_dynamic import ( + AscendW4A8DynamicFusedMoEMethod, AscendW4A8DynamicLinearMethod) + + +class TestAscendW4A8DynamicLinearMethod(TestBase): + + def setUp(self): + self.method = AscendW4A8DynamicLinearMethod() + self.method.group_size = 8 + + def test_get_weight(self): + weight = self.method.get_weight(8, 32, torch.bfloat16) + self.assertEqual(weight["weight"].dtype, torch.int8) + self.assertEqual(weight["weight"].shape, (32, 8)) + + def test_get_pergroup_param(self): + params = self.method.get_pergroup_param(8, 32, torch.bfloat16) + self.assertEqual(params["weight_scale"].dtype, torch.bfloat16) + 
self.assertEqual(params["weight_scale"].shape, (32, 1)) + self.assertEqual(params["weight_offset"].dtype, torch.bfloat16) + self.assertEqual(params["weight_offset"].shape, (32, 1)) + self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16) + self.assertEqual(params["weight_scale_second"].shape, (32, 1)) + self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16) + self.assertEqual(params["weight_offset_second"].shape, (32, 1)) + + +class TestAscendW4A8DynamicFusedMoEMethod(TestBase): + experts = 8 + input_size = 16 + output_size = 56 + group_size = 2 + + @patch('vllm_ascend.quantization.w4a8_dynamic.get_current_vllm_config') + @patch('vllm_ascend.quantization.w4a8_dynamic.get_ep_group') + @patch('vllm_ascend.quantization.w4a8_dynamic.get_mc2_group') + @patch('torch.distributed.get_rank', return_value=0) + def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ep_group, + get_current_vllm_config): + mock_vllm_config = Mock() + mock_vllm_config.quant_config = Mock(quant_description={ + "group_size": self.group_size, + "version": "0.0.0" + }) + mock_vllm_config.parallel_config = Mock(enable_expert_parallel=True) + get_current_vllm_config.return_value = mock_vllm_config + self.quant_method = AscendW4A8DynamicFusedMoEMethod() + + def test_get_weight(self): + # old quant version w4a8 weight + param_dict = self.quant_method.get_weight(self.experts, + self.input_size, + self.output_size, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight"].dtype, torch.int8) + self.assertEqual(param_dict["w13_weight"].shape, + (self.experts, 2 * self.input_size, self.output_size)) + # new quant version weight + self.quant_method.new_quant_version = True + param_dict = self.quant_method.get_weight(self.experts, + self.input_size, + self.output_size, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight"].dtype, torch.int8) + self.assertEqual(param_dict["w13_weight"].shape, + (self.experts, self.input_size, self.output_size)) + + def 
test_get_dynamic_quant_param(self): + # old quant version weight + param_dict = self.quant_method.get_dynamic_quant_param( + self.experts, self.input_size, self.output_size, torch.bfloat16) + self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16) + self.assertEqual(param_dict["w13_weight_scale"].shape, + (self.experts, 2 * self.input_size, 1)) + self.assertEqual(param_dict["w13_weight_scale_second"].dtype, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight_scale_second"].shape, + (self.experts, 2 * self.input_size, + self.output_size // self.group_size)) + self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16) + self.assertEqual(param_dict["w2_weight_scale"].shape, + (self.experts, self.output_size, 1)) + self.assertEqual(param_dict["w2_weight_scale_second"].dtype, + torch.bfloat16) + self.assertEqual(param_dict["w2_weight_scale_second"].shape, + (self.experts, self.output_size, + self.input_size // self.group_size)) + # new quant version weight + self.quant_method.new_quant_version = True + param_dict = self.quant_method.get_dynamic_quant_param( + self.experts, self.input_size, self.output_size, torch.bfloat16) + self.assertEqual(param_dict["w2_scale_bias"].dtype, torch.float32) + self.assertEqual( + param_dict["w2_scale_bias"].shape, + (self.experts, self.output_size, 16 // self.quant_method.tp_size)) + + @patch('torch_npu.npu_quantize') + @patch('torch.Tensor.npu') + def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize): + # old quant version weight + layer = torch.nn.Module() + layer.w13_weight = torch.nn.Parameter(torch.zeros( + (self.experts, 2 * self.input_size, self.output_size), + dtype=torch.int8), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(torch.zeros( + (self.experts, self.output_size, self.input_size), + dtype=torch.int8), + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(torch.ones( + (self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16), + 
requires_grad=False) + layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones( + (self.experts, 2 * self.input_size, + self.output_size // self.group_size), + dtype=torch.bfloat16), + requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter(torch.ones( + (self.experts, self.output_size, 1), dtype=torch.bfloat16), + requires_grad=False) + layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones( + (self.experts, self.output_size, + self.input_size // self.group_size), + dtype=torch.bfloat16), + requires_grad=False) + new_layer = copy.deepcopy(layer) + + mock_npu.return_value = torch.Tensor() + mock_npu_quantize.return_value = torch.Tensor() + self.quant_method.process_weights_after_loading(layer) + self.assertTrue(hasattr(layer, "w13_scale_bias")) + self.assertEqual(layer.w13_scale_bias.data.shape, + (self.experts, 2 * self.input_size)) + self.assertEqual(layer.w13_scale_bias.data.dtype, torch.float32) + self.assertTrue(hasattr(layer, "w2_scale_bias")) + self.assertEqual(layer.w2_scale_bias.data.shape, + (self.experts, self.output_size)) + self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32) + # new quant version weight + self.quant_method.new_quant_version = True + new_layer.w13_weight.data = torch.zeros( + (self.experts, self.input_size, self.output_size), + dtype=torch.int8) + new_layer.w2_weight.data = torch.zeros( + (self.experts, self.output_size // 2, self.input_size), + dtype=torch.int8) + w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1), + dtype=torch.float32) + new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias, + requires_grad=False) + w2_scale_bias = torch.zeros( + (self.experts, self.output_size, 16 // self.quant_method.tp_size), + dtype=torch.float32) + new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias, + requires_grad=False) + self.quant_method.process_weights_after_loading(new_layer) + self.assertEqual(new_layer.w13_scale_bias.data.shape, + (self.experts, 2 * self.input_size)) + 
self.assertEqual(new_layer.w2_scale_bias.data.shape, + (self.experts, self.output_size)) diff --git a/tests/ut/quantization/test_w8a8.py b/tests/ut/quantization/test_w8a8.py new file mode 100644 index 0000000..90a5f59 --- /dev/null +++ b/tests/ut/quantization/test_w8a8.py @@ -0,0 +1,930 @@ +import unittest +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.ops.layers.experts_selector import (_native_grouped_topk, + select_experts) +from vllm_ascend.quantization.w8a8 import (AscendC8KVCacheMethod, + AscendW8A8FusedMoEMethod, + AscendW8A8LinearMethod, + fused_experts, fused_experts_310p, + quant_per_tensor) + + +class TestQuantPerTensor(TestBase): + + @patch("torch_npu.npu_quantize") + def test_quant_per_tensor(self, mock_npu_quantize): + in_tensor = torch.randn(32, 128) + input_scale = torch.tensor(0.1) + input_offset = torch.tensor(0) + + expected_output = torch.randint(-128, 127, (32, 128), dtype=torch.int8) + mock_npu_quantize.return_value = expected_output + + output = quant_per_tensor(in_tensor, input_scale, input_offset) + + mock_npu_quantize.assert_called_once_with( + in_tensor, + input_scale, + input_offset, + torch.qint8, + -1, + False, + ) + + self.assertTrue(torch.equal(output, expected_output)) + + +class TestAscendW8A8LinearMethod(TestBase): + + def setUp(self): + self.method = AscendW8A8LinearMethod() + + def test_get_weight(self): + weight = self.method.get_weight(10, 20) + self.assertEqual(weight['weight'].dtype, torch.int8) + self.assertEqual(weight['weight'].shape, (20, 10)) + + def test_get_pertensor_param(self): + params = self.method.get_pertensor_param(torch.bfloat16) + self.assertEqual(params['input_scale'].dtype, torch.bfloat16) + self.assertEqual(params['input_offset'].dtype, torch.int8) + self.assertEqual(params['input_scale'].shape, (1, )) + self.assertEqual(params['input_offset'].shape, (1, )) + + def 
test_get_perchannel_param(self): + params = self.method.get_perchannel_param(10, torch.bfloat16) + + self.assertEqual(params['quant_bias'].dtype, torch.int32) + self.assertEqual(params['deq_scale'].dtype, torch.float32) + self.assertEqual(params['weight_scale'].dtype, torch.bfloat16) + self.assertEqual(params['weight_offset'].dtype, torch.bfloat16) + self.assertEqual(params['quant_bias'].shape, (10, )) + self.assertEqual(params['deq_scale'].shape, (10, )) + self.assertEqual(params['weight_scale'].shape, (10, 1)) + self.assertEqual(params['weight_offset'].shape, (10, 1)) + + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + @patch("torch_npu.npu_quant_matmul") + def test_apply_with_x_not_int8(self, mock_npu_quant_matmul, + mock_quant_per_tensor): + layer = MagicMock() + layer.aclnn_input_scale = 0.1 + layer.aclnn_input_offset = 0.2 + layer.weight = torch.randn(128, 256) + layer.deq_scale = 0.3 + + x = torch.randn(32, 128) + bias = torch.randn(256) + mock_quant_per_tensor.return_value = torch.randint(-128, + 127, + x.shape, + dtype=torch.int8) + + expected_y_output = torch.randn(32, 256) + mock_npu_quant_matmul.return_value = expected_y_output + + output = self.method.apply(layer, x, bias) + + expected_y_output += bias + self.assertTrue(torch.equal(output, expected_y_output)) + + @patch("torch_npu.npu_quant_matmul") + def test_apply_with_x_is_int8(self, mock_npu_quant_matmul): + layer = MagicMock() + layer.aclnn_input_scale = 0.1 + layer.aclnn_input_offset = 0.2 + layer.weight = torch.randn(128, 256) + layer.deq_scale = 0.3 + + x = torch.randint(-128, 127, (32, 128), dtype=torch.int8) + bias = torch.randn(256) + + expected_y_output = torch.randn(32, 256) + mock_npu_quant_matmul.return_value = expected_y_output + + output = self.method.apply(layer, x, bias) + expected_y_output += bias + self.assertTrue(torch.equal(output, expected_y_output)) + + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) + @patch("torch_npu.npu_quant_matmul") + def 
test_apply_with_x_is_310p(self, mock_npu_quant_matmul, mock_is_310p): + layer = MagicMock() + layer.aclnn_input_scale = 0.1 + layer.aclnn_input_offset = 0.2 + layer.weight = torch.randn(128, 256) + layer.deq_scale = 0.3 + + x = torch.randint(-128, 127, (32, 128), dtype=torch.int8) + bias = torch.randn(256) + + expected_y_output = torch.randn(32, 256) + mock_npu_quant_matmul.return_value = expected_y_output + + output = self.method.apply(layer, x, bias) + expected_y_output += bias + self.assertTrue(torch.equal(output, expected_y_output)) + + @patch('torch_npu.npu_format_cast') + def test_process_weights_after_loading(self, mock_npu_format_cast): + layer = MagicMock() + + layer.weight.data = torch.randn(128, 256) + layer.input_scale.data = torch.tensor([0.1]) + layer.input_offset.data = torch.tensor([0]) + layer.deq_scale = torch.tensor([0.5]) + layer.weight_scale.data = torch.randn(128, 1) + layer.weight_offset.data = torch.randn(128, 1) + + mock_npu_format_cast.return_value = MagicMock + self.method.process_weights_after_loading(layer) + + expected_offset = torch.tensor([0]).repeat(256).to(torch.int8) + self.assertTrue( + torch.equal(layer.aclnn_input_offset.data, expected_offset)) + self.assertFalse(layer.aclnn_input_offset.requires_grad) + + self.assertFalse(layer.deq_scale.requires_grad) + + self.assertEqual(layer.weight_scale.data.shape, (128, )) + self.assertEqual(layer.weight_offset.data.shape, (128, )) + + +class TestAscendW8A8FusedMoEMethod(TestBase): + + def setUp(self): + self.moe_method = AscendW8A8FusedMoEMethod() + self.num_experts = 4 + self.intermediate_size = 64 + self.hidden_size = 128 + self.dtype = torch.float32 + + def test_init(self): + self.assertTrue(self.moe_method.transpose_weight) + + def test_get_weight(self): + weights = self.moe_method.get_weight( + num_experts=self.num_experts, + intermediate_size_per_partition=self.intermediate_size, + hidden_sizes=self.hidden_size, + params_dtype=self.dtype) + + assert "w13_weight" in weights, 
f"w13_weight not in {weights}" + assert "w2_weight" in weights, f"w2_weight not in {weights}" + self.assertEqual( + weights["w13_weight"].shape, + (self.num_experts, 2 * self.intermediate_size, self.hidden_size)) + self.assertEqual( + weights["w2_weight"].shape, + (self.num_experts, self.hidden_size, self.intermediate_size)) + self.assertEqual(weights["w13_weight"].dtype, torch.int8) + self.assertEqual(weights["w2_weight"].dtype, torch.int8) + self.assertFalse(weights["w13_weight"].requires_grad) + self.assertFalse(weights["w2_weight"].requires_grad) + + def test_get_dynamic_quant_param(self): + quant_params = self.moe_method.get_dynamic_quant_param( + num_experts=self.num_experts, + intermediate_size_per_partition=self.intermediate_size, + hidden_sizes=self.hidden_size, + params_dtype=self.dtype) + + expected_params = [ + "w13_weight_scale", "w13_weight_offset", "w2_weight_scale", + "w2_weight_offset", "w2_deq_scale", "w13_deq_scale", + "w2_input_scale", "w13_input_scale", "w2_input_offset", + "w13_input_offset", "quant_bias" + ] + + for param in expected_params: + assert param in quant_params, f"{param} not in {quant_params}" + + # Check some sample shapes + self.assertEqual(quant_params["w13_weight_scale"].shape, + (self.num_experts, 2 * self.intermediate_size, 1)) + self.assertEqual(quant_params["w2_input_offset"].shape, + (self.num_experts, 1)) + self.assertEqual(quant_params["quant_bias"].shape, + (self.num_experts, self.hidden_size)) + + @patch('vllm_ascend.quantization.w8a8.select_experts') + @patch('vllm_ascend.quantization.w8a8.fused_experts') + def test_apply_with_other_expert_count(self, mock_fused_experts, + mock_select_experts): + # Setup + mock_layer = MagicMock() + x = torch.randn(32, self.hidden_size) + router_logits = torch.randn(32, 128) # 128 experts + top_k = 2 + + # Mock return values + mock_select_experts.return_value = (torch.randn(32, top_k), + torch.randint(0, 128, (32, top_k))) + mock_fused_experts.return_value = torch.randn(32, 
self.hidden_size) + + # Test + result = self.moe_method.apply(layer=mock_layer, + x=x, + router_logits=router_logits, + top_k=top_k, + renormalize=True, + global_num_experts=128) + + # Assertions + mock_select_experts.assert_called_once() + mock_fused_experts.assert_called_once() + self.assertEqual(result.shape, (32, self.hidden_size)) + + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) + @patch('vllm_ascend.quantization.w8a8.select_experts') + @patch('vllm_ascend.quantization.w8a8.fused_experts_310p') + def test_apply_is_310p(self, mock_fused_experts_310p, mock_select_experts, + mock_is_310p): + # Setup + mock_layer = MagicMock() + x = torch.randn(32, self.hidden_size) + router_logits = torch.randn(32, 128) # 128 experts + top_k = 2 + + # Mock return values + mock_select_experts.return_value = (torch.randn(32, top_k), + torch.randint(0, 128, (32, top_k))) + mock_fused_experts_310p.return_value = torch.randn( + 32, self.hidden_size) + + # Test + result = self.moe_method.apply(layer=mock_layer, + x=x, + router_logits=router_logits, + top_k=top_k, + renormalize=True, + global_num_experts=128) + + # Assertions + mock_select_experts.assert_called_once() + mock_fused_experts_310p.assert_called_once() + self.assertEqual(result.shape, (32, self.hidden_size)) + + +class TestAscendC8KVCacheMethod(TestBase): + + def setUp(self): + self.layer = MagicMock() + self.layer.num_kv_heads = 4 + self.layer.head_size = 64 + self.layer.num_heads = 8 + self.layer._k_scale_float = 1.0 + self.layer._v_scale_float = 1.0 + self.method = AscendC8KVCacheMethod() + + self.attention_type = MagicMock() + self.attention_type.DECODER = "decoder" + self.attention_type.ENCODER = "encoder" + + def test_create_weights(self): + """测试 create_weights 是否正确注册参数""" + AscendC8KVCacheMethod.create_weights(self.layer) + + self.layer.register_parameter.assert_any_call("key_antiquant_scale", + unittest.mock.ANY) + self.layer.register_parameter.assert_any_call("value_antiquant_scale", + 
unittest.mock.ANY) + + calls = self.layer.register_parameter.call_args_list + + for call in calls: + args, kwargs = call + param = kwargs.get('parameter', args[1] if len(args) > 1 else None) + + expected_shape = (self.layer.num_kv_heads * self.layer.head_size, ) + self.assertEqual(param.shape, expected_shape) + + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=False) + def test_process_weights_after_loading_not_310p(self, mock_is_310p): + key_data = torch.ones(4 * 64) + value_data = torch.ones(4 * 64) * 2 + + self.layer.key_antiquant_scale.data = key_data + self.layer.value_antiquant_scale.data = value_data + + self.method.process_weights_after_loading(self.layer) + + self.assertEqual(self.method.antiquant_scale_comb.shape, (2, 256)) + self.assertTrue(torch.all(self.method.antiquant_scale_comb[0] == 1)) + self.assertTrue(torch.all(self.method.antiquant_scale_comb[1] == 2)) + + @patch("vllm_ascend.quantization.w8a8.is_310p", return_value=True) + def test_process_weights_after_loading_is_310p(self, mock_is_310p): + key_data = torch.ones(4 * 64) + value_data = torch.ones(4 * 64) * 2 + + self.layer.key_antiquant_scale.data = key_data + self.layer.value_antiquant_scale.data = value_data + + self.method.process_weights_after_loading(self.layer) + + self.assertEqual(self.method.antiquant_scale_comb.shape, (2, 256)) + self.assertTrue(torch.all(self.method.antiquant_scale_comb[0] == 1)) + self.assertTrue(torch.all(self.method.antiquant_scale_comb[1] == 2)) + + @patch('torch_npu.npu_scatter_nd_update_') + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + def test_apply_decode_only(self, mock_quant, mock_scatter): + + num_tokens = 2 + query = torch.randn(num_tokens, + self.layer.num_heads * self.layer.head_size) + key = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + value = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + output = torch.empty_like(query) + + attn_metadata = MagicMock() + 
attn_metadata.attn_state = AscendAttentionState.DecodeOnly + attn_metadata.seq_lens = [10, 10] + attn_metadata.block_tables = torch.tensor([[0, 1], [1, 2]]) + attn_metadata.slot_mapping = torch.tensor([0, 1]) + attn_metadata.attn_mask = None + + block_size = 16 + key_cache = torch.empty(2, block_size, self.layer.num_kv_heads, + self.layer.head_size) + value_cache = torch.empty(2, block_size, self.layer.num_kv_heads, + self.layer.head_size) + kv_cache = (key_cache, value_cache) + + mock_quant.side_effect = [key, value] + + self.layer.key_antiquant_scale.data = torch.ones( + self.layer.num_kv_heads * self.layer.head_size) + self.layer.value_antiquant_scale.data = torch.ones( + self.layer.num_kv_heads * self.layer.head_size) + self.method.process_weights_after_loading(self.layer) + + expected_output = torch.randn( + num_tokens, self.layer.num_heads * self.layer.head_size) + with patch('torch_npu.npu_incre_flash_attention', + return_value=expected_output): + result = self.method.apply(self.layer, query, key, value, kv_cache, + attn_metadata, + self.attention_type.DECODER, 1.0, + output) + + self.assertEqual(mock_quant.call_count, 2) + self.assertEqual(mock_scatter.call_count, 2) + self.assertTrue(torch.equal(result, expected_output)) + + @patch('torch_npu.npu_scatter_nd_update_') + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + def test_apply_attn_metadata_without_decode(self, mock_quant, + mock_scatter): + + num_tokens = 2 + query = torch.randn(num_tokens, + self.layer.num_heads * self.layer.head_size) + key = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + value = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + output = torch.empty_like(query) + + attn_metadata = MagicMock(spec=[ + 'attn_state', 'seq_lens', 'block_tables', 'slot_mapping', + 'attn_mask' + ]) + attn_metadata.attn_state = AscendAttentionState.DecodeOnly + attn_metadata.seq_lens = [10, 10] + attn_metadata.block_tables = torch.tensor([[0, 
1], [1, 2]]) + attn_metadata.slot_mapping = torch.tensor([0, 1]) + attn_metadata.attn_mask = None + + block_size = 16 + key_cache = torch.empty(2, block_size, self.layer.num_kv_heads, + self.layer.head_size) + value_cache = torch.empty(2, block_size, self.layer.num_kv_heads, + self.layer.head_size) + kv_cache = (key_cache, value_cache) + + mock_quant.side_effect = [key, value] + + self.layer.key_antiquant_scale.data = torch.ones( + self.layer.num_kv_heads * self.layer.head_size) + self.layer.value_antiquant_scale.data = torch.ones( + self.layer.num_kv_heads * self.layer.head_size) + self.method.process_weights_after_loading(self.layer) + + expected_output = torch.randn( + num_tokens, self.layer.num_heads * self.layer.head_size) + with patch('torch_npu.npu_incre_flash_attention', + return_value=expected_output): + result = self.method.apply(self.layer, query, key, value, kv_cache, + attn_metadata, + self.attention_type.DECODER, 1.0, + output) + + self.assertEqual(mock_quant.call_count, 2) + self.assertEqual(mock_scatter.call_count, 2) + self.assertTrue(torch.equal(result, expected_output)) + + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + @patch('torch_npu._npu_flash_attention') + def test_apply_prefill_no_cache(self, mock_flash, mock_quant): + """Test apply method in prefill no-cache mode""" + + num_tokens = 2 + query = torch.randn(num_tokens, + self.layer.num_heads * self.layer.head_size) + key = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + value = torch.randn(num_tokens, + self.layer.num_kv_heads * self.layer.head_size) + output = torch.empty_like(query) + + attn_metadata = MagicMock() + attn_metadata.attn_state = AscendAttentionState.PrefillNoCache + attn_metadata.seq_lens = [10, 10] + attn_metadata.attn_mask = torch.ones(2, 2) + + kv_cache = (torch.tensor([]), torch.tensor([])) + mock_quant.return_value = key + + result = self.method.apply(self.layer, query, key, value, kv_cache, + attn_metadata, 
self.attention_type.DECODER, + 1.0, output) + + # Check that flash attention was called + mock_flash.assert_called_once() + + # Check output shape + self.assertEqual( + result.shape, + (num_tokens, self.layer.num_heads * self.layer.head_size)) + + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + def test_apply_unsupported_attention_type(self, mock_quant): + + query = torch.randn(1, self.layer.num_heads * self.layer.head_size) + key = torch.randn(1, self.layer.num_kv_heads * self.layer.head_size) + value = torch.randn(1, self.layer.num_kv_heads * self.layer.head_size) + output = torch.empty_like(query) + + mock_quant.return_value = key + + attn_metadata = MagicMock() + attn_metadata.attn_state = AscendAttentionState.PrefillNoCache + + with self.assertRaises(NotImplementedError) as cm: + self.method.apply(self.layer, query, key, value, (None, None), + attn_metadata, self.attention_type.ENCODER, 1.0, + output) + + assert "Encoder self-attention" in str( + cm.exception), f"Encoder self-attention not in {str(cm.exception)}" + assert "not implemented" in str( + cm.exception), f"not implemented not in{str(cm.exception)}" + + mock_quant.assert_not_called() + + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + def test_apply_unsupported_attention_state(self, mock_quant): + """Test apply with unsupported attention state""" + query = torch.randn(1, self.layer.num_heads * self.layer.head_size) + key = torch.randn(1, self.layer.num_kv_heads * self.layer.head_size) + value = torch.randn(1, self.layer.num_kv_heads * self.layer.head_size) + output = torch.empty_like(query) + + attn_metadata = MagicMock() + attn_metadata.attn_state = AscendAttentionState.PrefillCacheHit + mock_quant.return_value = key + kv_cache = (torch.tensor([]), torch.tensor([])) + + with self.assertRaises(NotImplementedError): + self.method.apply(self.layer, query, key, value, kv_cache, + attn_metadata, self.attention_type.DECODER, 1.0, + output) + + +class TestFusedExperts(TestBase): + + 
@patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + @patch('vllm_ascend.quantization.w8a8.get_ep_group') + @patch('torch_npu.npu_moe_init_routing_v2') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_swiglu') + @patch('torch_npu.npu_moe_finalize_routing') + def test_fused_experts_with_expert_map(self, mock_finalize, mock_swiglu, + mock_group_matmul, + mock_init_routing, + mock_get_ep_group, + mock_quant_per_tensor): + num_tokens = 32 + hidden_size = 128 + intermediate_size = 256 + num_experts = 4 + top_k = 2 + + hidden_states = torch.randn(num_tokens, hidden_size) + + w1 = torch.randn(num_experts, intermediate_size * 2, hidden_size) + w1_scale = torch.tensor([0.1]) + w1_input_scale = torch.tensor([[0.2, 0.2], [0.2, 0.2]]) + w1_input_offset = torch.tensor([0]) + + w2 = torch.randn(num_experts, hidden_size, intermediate_size) + w2_scale = torch.tensor([0.1]) + w2_input_scale = torch.tensor([0.2]) + w2_input_offset = torch.tensor([0]) + + topk_weights = torch.rand(num_tokens, top_k) + topk_ids = torch.randint(0, num_experts, (num_tokens, top_k)) + expert_map = torch.arange(num_experts) + + mock_get_ep_group.return_value.world_size = 8 + + mock_quant_per_tensor.return_value = torch.randint(-128, + 127, + hidden_states.shape, + dtype=torch.int8) + + mock_init_routing.return_value = (torch.randn(num_tokens * top_k, + hidden_size), + torch.arange(num_tokens * top_k), + torch.tensor([num_tokens // 2] * 2), + torch.tensor(1.0)) + + mock_group_matmul.side_effect = [[ + torch.randn(num_tokens * top_k, intermediate_size * 2) + ], [torch.randn(num_tokens * top_k, hidden_size)]] + + mock_swiglu.return_value = torch.randn(num_tokens * top_k, + intermediate_size) + + expected_output = torch.randn(num_tokens, hidden_size) + mock_finalize.return_value = expected_output + + output = fused_experts( + hidden_states=hidden_states, + w1=w1, + w1_scale=w1_scale, + w1_input_scale=w1_input_scale, + w1_input_offset=w1_input_offset, + w2=w2, + w2_scale=w2_scale, + 
w2_input_scale=w2_input_scale, + w2_input_offset=w2_input_offset, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=num_experts, + expert_map=expert_map, + ) + + mock_init_routing.assert_called_once() + + self.assertEqual(mock_group_matmul.call_count, 2) + + self.assertEqual(output.shape, (num_tokens, hidden_size)) + + mock_finalize.assert_called_once() + + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + @patch('vllm_ascend.quantization.w8a8.get_ep_group') + @patch('torch_npu.npu_grouped_matmul') + @patch('torch_npu.npu_swiglu') + def test_fused_experts_without_expert_map(self, mock_swiglu, + mock_group_matmul, + mock_get_ep_group, + mock_quant_per_tensor): + num_tokens = 16 + hidden_size = 64 + intermediate_size = 128 + num_experts = 8 + top_k = 1 + + hidden_states = torch.randn(num_tokens, hidden_size) + w1 = torch.randn(num_experts, intermediate_size * 2, hidden_size) + w2 = torch.randn(num_experts, hidden_size, intermediate_size) + topk_weights = torch.rand(num_tokens, top_k) + topk_ids = torch.randint(0, num_experts, (num_tokens, top_k)) + + mock_get_ep_group.return_value.world_size = 8 + + mock_quant_per_tensor.return_value = torch.randint(-128, + 127, + hidden_states.shape, + dtype=torch.int8) + mock_group_matmul.side_effect = [[ + torch.randn(num_tokens * top_k, intermediate_size * 2) + ], [torch.randn(num_tokens * top_k, hidden_size)]] + mock_swiglu.return_value = torch.randn(num_tokens * top_k, + intermediate_size) + with self.assertRaises(NotImplementedError): + fused_experts( + hidden_states=hidden_states, + w1=w1, + w1_scale=torch.tensor([0.1]), + w1_input_scale=torch.tensor([[0.2, 0.2], [0.2, 0.2]]), + w1_input_offset=torch.tensor([0]), + w2=w2, + w2_scale=torch.tensor([0.1]), + w2_input_scale=torch.tensor([0.1]), + w2_input_offset=torch.tensor([0]), + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=num_experts, + expert_map=None, + ) + + +class 
TestFusedExperts310(TestBase): + + @patch('torch_npu.npu_quant_grouped_matmul_dequant') + @patch("vllm_ascend.quantization.w8a8.quant_per_tensor") + @patch('vllm_ascend.quantization.w8a8.get_ep_group') + @patch('torch_npu.npu_swiglu') + def test_fused_experts_310p_with_expert_map(self, mock_swiglu, + mock_get_ep_group, + mock_quant_per_tensor, + mock_matmul_dequant): + num_tokens = 32 + hidden_size = 128 + intermediate_size = 256 + num_experts = 4 + top_k = 1 + + hidden_states = torch.randn(num_tokens, hidden_size) + + w1 = torch.randn(num_experts, intermediate_size * 2, hidden_size) + w1_scale = torch.tensor([0.1]) + w1_input_scale = torch.tensor([[0.2, 0.2], [0.2, 0.2]]) + + w2 = torch.randn(num_experts, hidden_size, intermediate_size) + w2_scale = torch.tensor([0.1]) + w2_input_scale = torch.tensor([0.2]) + + topk_weights = torch.rand(num_tokens, top_k) + topk_ids = torch.randint(0, num_experts, (num_tokens, top_k)) + expert_map = torch.arange(num_experts) + + mock_get_ep_group.return_value.world_size = 1 + + mock_quant_per_tensor.return_value = torch.randint(-128, + 127, + hidden_states.shape, + dtype=torch.int8) + + mock_swiglu.return_value = torch.randn(num_tokens * top_k, + intermediate_size) + + mock_matmul_dequant.return_value = hidden_states + + output = fused_experts_310p( + hidden_states=hidden_states, + w1=w1, + w1_scale=w1_scale, + w1_input_scale=w1_input_scale, + w2=w2, + w2_scale=w2_scale, + w2_input_scale=w2_input_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=num_experts, + expert_map=expert_map, + ) + + self.assertEqual(output.shape, (num_tokens, hidden_size)) + self.assertEqual(mock_matmul_dequant.call_count, 2) + + +class TestSelectExperts(TestBase): + + def setUp(self): + # Common test data + self.num_tokens = 10 + self.hidden_size = 32 + self.num_experts = 8 + self.top_k = 2 + + self.hidden_states = torch.randn(self.num_tokens, self.hidden_size) + self.router_logits = torch.randn(self.num_tokens, 
self.num_experts) + + @patch('torch_npu.npu_moe_gating_top_k_softmax') + def test_softmax_scoring(self, mock_topk): + """Test softmax scoring function""" + mock_topk.return_value = (torch.ones(self.num_tokens, self.top_k), + torch.zeros(self.num_tokens, + self.top_k, + dtype=torch.long), + torch.arange(0, + self.num_tokens * self.top_k, + dtype=torch.int32).view( + self.top_k, + -1).permute(1, + 0).contiguous()) + + weights, ids, _ = select_experts(hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=False, + scoring_func="softmax") + + self.assertEqual(weights.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.shape, (self.num_tokens, self.top_k)) + + def test_sigmoid_scoring(self): + """Test sigmoid scoring function""" + + weights, ids, _ = select_experts(hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=False, + scoring_func="sigmoid") + + self.assertEqual(weights.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.shape, (self.num_tokens, self.top_k)) + + def test_invalid_scoring_func(self): + """Test invalid scoring function raises ValueError""" + with self.assertRaises(ValueError): + select_experts(hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=False, + scoring_func="invalid_func") + + @patch('torch.topk') + def test_grouped_topk(self, mock_topk): + """Test grouped topk functionality""" + mock_topk.return_value = (torch.ones(self.num_tokens, self.top_k), + torch.zeros(self.num_tokens, + self.top_k, + dtype=torch.long)) + + weights, ids, _ = select_experts(hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=True, + renormalize=False, + topk_group=4, + num_expert_group=2) + + mock_topk.assert_called() + self.assertEqual(weights.shape, 
(self.num_tokens, self.top_k)) + self.assertEqual(ids.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.dtype, torch.int32) + + @patch('vllm_ascend.ops.layers.experts_selector._native_grouped_topk') + def test_grouped_topk_with_correction_bias(self, mock_grouped_topk): + """Test grouped topk with expert score correction bias""" + mock_grouped_topk.return_value = torch.ones(self.num_tokens, + self.num_experts) + + e_score_correction_bias = torch.randn(self.num_experts) + weights, ids, _ = select_experts( + hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=True, + renormalize=False, + topk_group=4, + num_expert_group=2, + e_score_correction_bias=e_score_correction_bias) + + mock_grouped_topk.assert_called_once() + self.assertEqual(weights.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.shape, (self.num_tokens, self.top_k)) + + def test_custom_routing_function(self): + """Test custom routing function""" + mock_custom_routing = MagicMock() + mock_custom_routing.return_value = (torch.ones(self.num_tokens, + self.top_k), + torch.zeros(self.num_tokens, + self.top_k, + dtype=torch.int32)) + + weights, ids, _ = select_experts( + hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=False, + custom_routing_function=mock_custom_routing) + + mock_custom_routing.assert_called_once() + self.assertEqual(weights.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.shape, (self.num_tokens, self.top_k)) + self.assertEqual(ids.dtype, torch.int32) + + @patch('torch_npu.npu_moe_gating_top_k_softmax') + def test_renormalize(self, mock_topk): + """Test renormalization""" + mock_topk.return_value = (torch.ones(self.num_tokens, self.top_k), + torch.zeros(self.num_tokens, + self.top_k, + dtype=torch.long), + torch.arange(0, + self.num_tokens * self.top_k, + dtype=torch.int32).view( + self.top_k, + -1).permute(1, + 
0).contiguous()) + + weights, ids, _ = select_experts( + hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=True, + ) + + # Check if weights are normalized (sum to 1 for each token) + sums = weights.sum(dim=-1) + self.assertTrue(torch.allclose(sums, torch.ones_like(sums))) + + @patch('torch_npu.npu_moe_gating_top_k_softmax') + def test_output_dtypes(self, mock_topk): + """Test output dtypes""" + mock_topk.return_value = (torch.ones(self.num_tokens, self.top_k), + torch.zeros(self.num_tokens, + self.top_k, + dtype=torch.long), + torch.arange(0, + self.num_tokens * self.top_k, + dtype=torch.int32).view( + self.top_k, + -1).permute(1, + 0).contiguous()) + + weights, ids, _ = select_experts( + hidden_states=self.hidden_states, + router_logits=self.router_logits, + top_k=self.top_k, + use_grouped_topk=False, + renormalize=False, + ) + + self.assertEqual(weights.dtype, self.hidden_states.dtype) + self.assertEqual(ids.dtype, torch.int32) + + +class TestNativeGroupedTopkPartialMock(TestBase): + + def test_basic_group_selection(self): + topk_weights = torch.tensor([[0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6], + [0.6, 0.4, 0.7, 0.3, 0.8, 0.2, 0.9, 0.1], + [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3], + [0.9, 0.1, 0.8, 0.2, 0.7, 0.3, 0.6, 0.4]], + dtype=torch.float32) + + expected_topk_indices = torch.tensor([[0, 1], [1, 0], [0, 1], [0, 1]]) + + with patch('torch.topk', + return_value=(None, expected_topk_indices)) as mock_topk: + result = _native_grouped_topk(topk_weights=topk_weights, + num_expert_group=2, + topk_group=2) + + mock_topk.assert_called_once() + + expected_result = topk_weights + self.assertTrue(torch.allclose(result, expected_result)) + + def test_partial_group_selection(self): + + topk_weights = torch.tensor([[0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6], + [0.6, 0.4, 0.7, 0.3, 0.8, 0.2, 0.9, 0.1]]) + + expected_topk_indices = torch.tensor([[0], [1]]) + + with patch('torch.topk', 
return_value=(None, expected_topk_indices)): + result = _native_grouped_topk(topk_weights=topk_weights, + num_expert_group=2, + topk_group=1) + + expected_result = torch.tensor( + [[0.1, 0.9, 0.2, 0.8, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.8, 0.2, 0.9, 0.1]]) + self.assertTrue(torch.allclose(result, expected_result)) + + def test_single_group(self): + topk_weights = torch.tensor([[0.1, 0.9, 0.2], [0.8, 0.3, 0.7]]) + + expected_topk_indices = torch.tensor([[0], [0]]) + + with patch('torch.topk', return_value=(None, expected_topk_indices)): + result = _native_grouped_topk(topk_weights=topk_weights, + num_expert_group=1, + topk_group=1) + self.assertTrue(result.numel() > 0) diff --git a/tests/ut/sample/test_rejection_sampler.py b/tests/ut/sample/test_rejection_sampler.py new file mode 100644 index 0000000..adbf376 --- /dev/null +++ b/tests/ut/sample/test_rejection_sampler.py @@ -0,0 +1,203 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +from unittest.mock import patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.sample.rejection_sampler import ( + expand_batch_to_tokens, expand_pytorch, rejection_greedy_sample_pytorch, + rejection_random_sample_pytorch, sample_recovered_tokens_pytorch) + +# Global constants +PLACEHOLDER_TOKEN_ID = -1 +GREEDY_TEMPERATURE = 0.0 +MAX_SPEC_LEN = 8 # Used as MAX_NUM_TOKENS in expand_batch_to_tokens + + +class TestAscendRejectionSampler(TestBase): + + def test_rejection_greedy_sample_pytorch(self): + """Test greedy rejection sampling: stop when draft doesn't match, otherwise append bonus token""" + batch_size = 2 + max_spec_len = 2 + output_token_ids = torch.full((batch_size, max_spec_len + 1), + PLACEHOLDER_TOKEN_ID) + + cu_num_draft_tokens = torch.tensor([2, 4]) + num_draft_tokens = [2, 2] + draft_token_ids = torch.tensor([10, 11, 20, 21]) + target_argmax = torch.tensor([10, 99, 20, 22]) + bonus_token_ids = torch.tensor([[100], [200]]) + + is_greedy = torch.tensor([True, True]) + + rejection_greedy_sample_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + target_argmax, + bonus_token_ids, + num_draft_tokens, + max_spec_len, + is_greedy, + ) + + assert output_token_ids[0, 0].item() == 10 + assert output_token_ids[0, 1].item() == 99 + assert output_token_ids[1, 0].item() == 20 + assert output_token_ids[1, 2].item() == PLACEHOLDER_TOKEN_ID + + def test_rejection_random_sample_pytorch(self): + """Test random rejection sampling: accept based on uniform probability""" + batch_size = 2 + max_spec_len = 3 + output_token_ids = torch.full((batch_size, max_spec_len + 1), + PLACEHOLDER_TOKEN_ID) + + cu_num_draft_tokens = torch.tensor([2, 1]) + draft_token_ids = torch.tensor([1, 0, 2]) + draft_probs = torch.tensor([ + [0.0, 0.6, 0.0, 0.4], # vocab_size=4 + [0.1, 0.2, 0.3, 0.4], + [0.5, 0.5, 0.0, 0.0], + ]) + target_probs = torch.tensor([ + [0.0, 0.8, 0.0, 0.2], + [0.2, 0.1, 0.3, 0.4], + [0.9, 0.1, 0.0, 0.0], + ]) + 
bonus_token_ids = torch.tensor([[100], [200]]) + recovered_token_ids = torch.tensor([1, 2, 3]) + uniform_probs = torch.tensor([0.7, 0.6, 0.5]) + is_greedy = torch.tensor([False, False]) + vocab_size = 4 + + rejection_random_sample_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + bonus_token_ids, + recovered_token_ids, + uniform_probs, + is_greedy, + max_spec_len, + vocab_size, + IS_NGRAM=False, + ) + + assert output_token_ids[0, 0].item() == 1 + assert output_token_ids[0, 1].item() == 0 + assert output_token_ids[0, 2].item() == 100 + + def test_expand_pytorch(self): + """Test expand_pytorch functionality""" + input_ptr = torch.tensor([10, 20, 30], dtype=torch.int32) + cu_num_tokens_ptr = torch.tensor([2, 5, 7]) + output_ptr = torch.empty(7, dtype=torch.int32) + + expand_pytorch( + output_ptr, + input_ptr, + cu_num_tokens_ptr, + replace_from=0, + replace_to=0, + MAX_NUM_TOKENS=MAX_SPEC_LEN, + ) + + expected = torch.tensor([10, 10, 20, 20, 20, 30, 30]) + assert torch.equal(output_ptr, expected) + + def test_expand_batch_to_tokens(self): + """Test expand_batch_to_tokens wrapper""" + x = torch.tensor([10, 20, 30]) + cu_num_tokens = torch.tensor([2, 5, 7]) + num_tokens = 7 + + with patch("vllm_ascend.sample.rejection_sampler.expand_pytorch" + ) as mock_kernel: + expand_batch_to_tokens(x, cu_num_tokens, num_tokens) + mock_kernel.assert_called_once() + args = mock_kernel.call_args[0] + assert (args[1] == x).all() + assert (args[2] == cu_num_tokens).all() + + # Run actual function + result = expand_batch_to_tokens(x, cu_num_tokens, num_tokens) + expected = torch.tensor([10, 10, 20, 20, 20, 30, 30]) + assert torch.equal(result, expected) + + def test_sample_recovered_tokens_pytorch_ngram(self): + """Test recovered token sampling under n-gram mode""" + output_token_ids = torch.empty(2, dtype=torch.int32) + cu_num_draft_tokens = torch.tensor([1, 2]) + draft_token_ids = torch.tensor([1, 2]) + draft_probs = None + 
target_probs = torch.tensor([ + [0.1, 0.2, 0.7], + [0.3, 0.3, 0.4], + ]) + q = torch.tensor([ + [0.1, 0.2, 0.7], + [0.5, 0.4, 0.1], + ]) + vocab_size = 3 + + sample_recovered_tokens_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + q, + vocab_size, + IS_NGRAM=True, + ) + + assert output_token_ids[0].item() == 0 + assert output_token_ids[1].item() == 1 + + def test_sample_recovered_tokens_pytorch_autoregressive(self): + """Test recovered token sampling for autoregressive models""" + output_token_ids = torch.empty(2, dtype=torch.int32) + cu_num_draft_tokens = torch.tensor([1, 1]) + draft_token_ids = torch.tensor([0, 1]) + draft_probs = torch.tensor([ + [0.6, 0.1, 0.3], + [0.2, 0.7, 0.1], + ]) + target_probs = torch.tensor([ + [0.8, 0.1, 0.1], + [0.3, 0.6, 0.1], + ]) + q = torch.tensor([ + [0.5, 0.3, 0.2], + [0.1, 0.8, 0.1], + ]) + vocab_size = 3 + + sample_recovered_tokens_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + q, + vocab_size, + IS_NGRAM=False, + ) + assert output_token_ids[0].item() == 0 diff --git a/tests/ut/sample/test_sampler.py b/tests/ut/sample/test_sampler.py new file mode 100644 index 0000000..98a83e6 --- /dev/null +++ b/tests/ut/sample/test_sampler.py @@ -0,0 +1,32 @@ +from unittest import mock + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.sample.sampler import AscendSampler, AscendTopKTopPSampler + + +class TestAscendSampler(TestBase): + + def test_init_with_raw_logprobs(self): + sampler = AscendSampler(logprobs_mode="raw_logprobs") + self.assertEqual(sampler.logprobs_mode, "raw_logprobs") + self.assertTrue(hasattr(sampler, 'topk_topp_sampler')) + self.assertIsInstance(sampler.topk_topp_sampler, AscendTopKTopPSampler) + + +class TestAscendTopKTopPSampler(TestBase): + + @mock.patch("torch_npu.npu_top_k_top_p") + def test_npu_topk_topp_called_when_optimized(self, mock_npu_op): + mock_npu_op.return_value = 
(torch.randn(1, 3)) + sampler = AscendTopKTopPSampler() + + logits = torch.tensor([[1.0, 2.0, 3.0]]) + k = torch.tensor([2]) + p = torch.tensor([0.9]) + generators = {0: torch.Generator()} + generators[0].manual_seed(42) + + sampler.forward_native(logits, generators, k, p) + mock_npu_op.assert_called_once_with(logits, p, k) diff --git a/tests/ut/test_ascend_config.py b/tests/ut/test_ascend_config.py new file mode 100644 index 0000000..4c7cfa6 --- /dev/null +++ b/tests/ut/test_ascend_config.py @@ -0,0 +1,361 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import os + +from transformers import PretrainedConfig +from vllm.config import ModelConfig, ParallelConfig, VllmConfig + +from tests.ut.base import TestBase +from vllm_ascend.ascend_config import (_check_torchair_supported, + check_ascend_config, + clear_ascend_config, get_ascend_config, + init_ascend_config) + + +class TestAscendConfig(TestBase): + + @staticmethod + def _clean_up_ascend_config(func): + + def wrapper(*args, **kwargs): + clear_ascend_config() + func(*args, **kwargs) + clear_ascend_config() + + return wrapper + + @_clean_up_ascend_config + def test_init_ascend_config_without_additional_config(self): + test_vllm_config = VllmConfig() + # No additional config given, check the default value here. 
+ ascend_config = init_ascend_config(test_vllm_config) + self.assertIsNone(ascend_config.expert_map_path) + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertFalse(torchair_graph_config.enabled) + self.assertEqual(torchair_graph_config.mode, '') + self.assertFalse(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, []) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertFalse(torchair_graph_config.enable_multistream_mla) + self.assertFalse(torchair_graph_config.enable_multistream_moe) + self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertFalse(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertFalse(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_additional_config(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": True, + "graph_batch_sizes": [1, 2, 4], + "graph_batch_sizes_init": False, + "enable_multistream_mla": True, + "enable_multistream_moe": True, + "enable_view_optimize": True, + "enable_kv_nz": True + }, + "ascend_scheduler_config": { + "enabled": True + }, + "expert_map_path": "test_expert_map_path", + "refresh": True, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(ascend_config.expert_map_path, "test_expert_map_path") + + torchair_graph_config = ascend_config.torchair_graph_config + self.assertTrue(torchair_graph_config.enabled) + self.assertTrue(torchair_graph_config.use_cached_graph) + self.assertEqual(torchair_graph_config.graph_batch_sizes, [1, 2, 4]) + self.assertFalse(torchair_graph_config.graph_batch_sizes_init) + self.assertTrue(torchair_graph_config.enable_multistream_mla) + self.assertTrue(torchair_graph_config.enable_multistream_moe) + 
self.assertTrue(torchair_graph_config.enable_view_optimize) + self.assertTrue(torchair_graph_config.enable_kv_nz) + + ascend_scheduler_config = ascend_config.ascend_scheduler_config + self.assertTrue(ascend_scheduler_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_refresh(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertFalse(ascend_config.torchair_graph_config.enabled) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True, + } + ascend_config = init_ascend_config(test_vllm_config) + self.assertTrue(ascend_config.torchair_graph_config.enabled) + + @_clean_up_ascend_config + def test_init_ascend_config_with_wrong_input(self): + test_vllm_config = VllmConfig() + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "graph_batch_sizes": "fake_size", + }, + "refresh": True, + } + with self.assertRaises(TypeError): + init_ascend_config(test_vllm_config) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "graph_batch_sizes": [1, 2, 4, 8], + "graph_batch_sizes_init": True, + }, + "refresh": True, + } + with self.assertRaises(ValueError): + init_ascend_config(test_vllm_config) + + @_clean_up_ascend_config + def test_get_ascend_config(self): + test_vllm_config = VllmConfig() + ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + + @_clean_up_ascend_config + def test_get_ascend_config_without_init(self): + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_clear_ascend_config(self): + test_vllm_config = VllmConfig() + 
ascend_config = init_ascend_config(test_vllm_config) + self.assertEqual(get_ascend_config(), ascend_config) + clear_ascend_config() + with self.assertRaises(RuntimeError): + get_ascend_config() + + @_clean_up_ascend_config + def test_check_ascend_config_pass(self): + test_vllm_config = VllmConfig() + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + @_clean_up_ascend_config + def test_check_ascend_config_wrong_case(self): + test_vllm_config = VllmConfig() + + # torchair + eager mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + enforce_eager = True + check_ascend_config(test_vllm_config, enforce_eager) + # torchair + non deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + }, + "refresh": True + } + model_path = os.path.join(os.path.dirname(__file__), "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "llama" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + # aclgraph + deepseek model + with self.assertRaises(NotImplementedError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + }, + "refresh": True + } + model_path = 
os.path.join(os.path.dirname(__file__), "fake_weight") + fake_model_config = ModelConfig(model=model_path) + fake_model_config.hf_config = PretrainedConfig() + fake_model_config.hf_config.model_type = "deepseek" + test_vllm_config.model_config = fake_model_config + init_ascend_config(test_vllm_config) + check_ascend_config(test_vllm_config, False) + + def test_check_torchair_supported(self): + test_cases = [('deepseek_v3', True), ('PanguProMoE', True), + ('qwen', True), ('llama', False)] + for model_type, expected_output in test_cases: + self.assertEqual(_check_torchair_supported(model_type), + expected_output) + + @_clean_up_ascend_config + def test_ascend_config_load_error(self): + test_vllm_config = VllmConfig() + # graph_batch_sizes should be list. + with self.assertRaises(TypeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "graph_batch_sizes": "fake_size", + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # use_cached_graph should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "use_cached_graph": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # use_cached_kv_cache_bytes should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "use_cached_kv_cache_bytes": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # graph_batch_sizes should not be set without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "graph_batch_sizes": [1, 2, 4], + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # use_cached_kv_cache_bytes is valid only when torchair graph mode and use_cached_graph are enabled + with 
self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": True, + "use_cached_graph": False, + "use_cached_kv_cache_bytes": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # graph_batch_sizes_init should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "graph_batch_sizes_init": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # enable_multistream_mla should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "enable_multistream_mla": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # enable_multistream_moe should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "enable_multistream_moe": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # mode should not be configured without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "mode": 'max-autotune', + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + # enable_kv_nz should not be enabled without torchair graph mode + with self.assertRaises(RuntimeError): + test_vllm_config.additional_config = { + "torchair_graph_config": { + "enabled": False, + "enable_kv_nz": True, + }, + "refresh": True + } + init_ascend_config(test_vllm_config) + + with self.assertRaises(AssertionError): + test_vllm_config.additional_config = { + "lmhead_tensor_parallel_size": 2, + "refresh": True + } + test_vllm_config.parallel_config = ParallelConfig( + data_parallel_size=4, tensor_parallel_size=2) 
+ init_ascend_config(test_vllm_config) diff --git a/tests/ut/test_envs.py b/tests/ut/test_envs.py new file mode 100644 index 0000000..95c45be --- /dev/null +++ b/tests/ut/test_envs.py @@ -0,0 +1,62 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +import inspect +import os + +import vllm_ascend.envs as envs_ascend +from tests.ut.base import TestBase + + +class TestEnvVariables(TestBase): + + def setUp(self): + self.env_vars = list(envs_ascend.env_variables.keys()) + + def test_env_vars_behavior(self): + for var_name in self.env_vars: + with self.subTest(var=var_name): + original_val = os.environ.get(var_name) + var_handler = envs_ascend.env_variables[var_name] + + try: + if var_name in os.environ: + del os.environ[var_name] + self.assertEqual(getattr(envs_ascend, var_name), + var_handler()) + + handler_source = inspect.getsource(var_handler) + if 'int(' in handler_source: + test_vals = ["123", "456"] + elif 'bool(int(' in handler_source: + test_vals = ["0", "1"] + else: + test_vals = [f"test_{var_name}", f"custom_{var_name}"] + + for test_val in test_vals: + os.environ[var_name] = test_val + self.assertEqual(getattr(envs_ascend, var_name), + var_handler()) + + finally: + if original_val is None: + os.environ.pop(var_name, None) + else: + os.environ[var_name] = original_val + + def test_dir_and_getattr(self): + self.assertEqual(sorted(envs_ascend.__dir__()), sorted(self.env_vars)) + for var_name in self.env_vars: + 
with self.subTest(var=var_name): + getattr(envs_ascend, var_name) diff --git a/tests/ut/test_platform.py b/tests/ut/test_platform.py new file mode 100644 index 0000000..de8b9be --- /dev/null +++ b/tests/ut/test_platform.py @@ -0,0 +1,714 @@ +import importlib +import unittest +from datetime import timedelta +from unittest.mock import MagicMock, patch + +import pytest +import torch +from torch.distributed import ProcessGroup +from torch.distributed.distributed_c10d import PrefixStore +from vllm.config import CompilationLevel +from vllm.config.compilation import CUDAGraphMode +from vllm.platforms import PlatformEnum + +from tests.ut.base import TestBase +from vllm_ascend.platform import NPUPlatform +from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD + + +class TestNPUPlatform(TestBase): + + @staticmethod + def mock_vllm_config(): + mock_vllm_config = MagicMock() + mock_vllm_config.compilation_config = MagicMock() + mock_vllm_config.model_config = MagicMock() + mock_vllm_config.parallel_config = MagicMock() + mock_vllm_config.cache_config = MagicMock() + mock_vllm_config.scheduler_config = MagicMock() + mock_vllm_config.speculative_config = None + mock_vllm_config.compilation_config.pass_config.enable_sequence_parallelism = False + mock_vllm_config.compilation_config.cudagraph_mode = None + return mock_vllm_config + + @staticmethod + def mock_vllm_ascend_config(): + mock_ascend_config = MagicMock() + mock_ascend_config.torchair_graph_config.enabled = False + mock_ascend_config.ascend_scheduler_config.enabled = False + return mock_ascend_config + + def setUp(self): + self.platform = NPUPlatform() + + def test_class_variables(self): + self.assertEqual(NPUPlatform._enum, PlatformEnum.OOT) + self.assertEqual(NPUPlatform.device_name, "npu") + self.assertEqual(NPUPlatform.device_type, "npu") + self.assertEqual(NPUPlatform.simple_compile_backend, "eager") + self.assertEqual(NPUPlatform.ray_device_key, "NPU") + self.assertEqual(NPUPlatform.device_control_env_var, + 
"ASCEND_RT_VISIBLE_DEVICES") + self.assertEqual(NPUPlatform.dispatch_key, "PrivateUse1") + self.assertEqual(NPUPlatform.supported_quantization, + [ASCEND_QUANTIZATION_METHOD]) + + def test_is_sleep_mode_available(self): + self.assertTrue(self.platform.is_sleep_mode_available()) + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig") + def test_pre_register_and_update_with_parser(self, mock_quant_config, + mock_adapt_patch): + mock_parser = MagicMock() + mock_action = MagicMock() + mock_action.choices = ["awq", "gptq"] + mock_parser._option_string_actions = {"--quantization": mock_action} + + self.platform.pre_register_and_update(mock_parser) + + mock_adapt_patch.assert_called_once_with(is_global_patch=True) + + self.assertTrue(ASCEND_QUANTIZATION_METHOD in mock_action.choices) + self.assertEqual(len(mock_action.choices), 3) # original 2 + ascend + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig") + def test_pre_register_and_update_without_parser(self, mock_quant_config, + mock_adapt_patch): + self.platform.pre_register_and_update(None) + + mock_adapt_patch.assert_called_once_with(is_global_patch=True) + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig") + def test_pre_register_and_update_with_parser_no_quant_action( + self, mock_quant_config, mock_adapt_patch): + mock_parser = MagicMock() + mock_parser._option_string_actions = {} + + self.platform.pre_register_and_update(mock_parser) + + mock_adapt_patch.assert_called_once_with(is_global_patch=True) + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.quantization.quant_config.AscendQuantConfig") + def test_pre_register_and_update_with_existing_ascend_quant( + self, mock_quant_config, mock_adapt_patch): + mock_parser = MagicMock() + mock_action = MagicMock() + mock_action.choices = ["awq", ASCEND_QUANTIZATION_METHOD] + 
mock_parser._option_string_actions = {"--quantization": mock_action} + + self.platform.pre_register_and_update(mock_parser) + + mock_adapt_patch.assert_called_once_with(is_global_patch=True) + self.assertEqual(len(mock_action.choices), 2) + + def test_get_device_capability(self): + self.assertIsNone(self.platform.get_device_capability(device_id=0)) + + @patch("torch.npu.get_device_name") + def test_get_device_name(self, mock_get_device_name): + device_id = 0 + device_name = "Ascend910B2" + mock_get_device_name.return_value = device_name + self.assertEqual(self.platform.get_device_name(device_id), device_name) + mock_get_device_name.assert_called_once_with(0) + + def test_is_async_output_supported(self): + self.assertTrue( + self.platform.is_async_output_supported(enforce_eager=None)) + self.assertTrue( + self.platform.is_async_output_supported(enforce_eager=True)) + self.assertTrue( + self.platform.is_async_output_supported(enforce_eager=False)) + + @patch("torch.inference_mode") + def test_inference_mode(self, mock_inference_mode): + mock_inference_mode.return_value = None + self.assertIsNone(self.platform.inference_mode()) + mock_inference_mode.assert_called_once() + + @patch("torch.npu.set_device") + def test_set_device_normal(self, mock_set_device): + device = torch.device("npu:0") + self.platform.set_device(device) + mock_set_device.assert_called_once_with(device) + + @patch("torch.npu.set_device", + side_effect=RuntimeError("Device not available")) + def test_set_device_failure(self, mock_set_device): + device = torch.device("npu:0") + with self.assertRaises(RuntimeError): + self.platform.set_device(device) + mock_set_device.assert_called_once_with(device) + + @patch("torch.npu.empty_cache") + def test_empty_cache_normal(self, mock_empty_cache): + self.platform.empty_cache() + mock_empty_cache.assert_called_once() + + @patch("torch.npu.empty_cache", + side_effect=RuntimeError("Cache clearing failed")) + def test_empty_cache_failure(self, mock_empty_cache): + 
with self.assertRaises(RuntimeError): + self.platform.empty_cache() + mock_empty_cache.assert_called_once() + + @patch("torch.npu.synchronize") + def test_synchronize_normal(self, mock_synchronize): + self.platform.synchronize() + mock_synchronize.assert_called_once() + + @patch("torch.npu.synchronize", + side_effect=RuntimeError("Synchronization failed")) + def test_synchronize_failure(self, mock_synchronize): + with self.assertRaises(RuntimeError): + self.platform.synchronize() + mock_synchronize.assert_called_once() + + @patch("torch.npu.mem_get_info") + def test_mem_get_info_normal(self, mock_mem_get_info): + free_memory_size = 1024 + total_memory_size = 2048 + memory_info = (free_memory_size, total_memory_size) + mock_mem_get_info.return_value = memory_info + result = self.platform.mem_get_info() + self.assertIsInstance(result, tuple) + self.assertEqual(len(result), 2) + self.assertEqual(result, memory_info) + mock_mem_get_info.assert_called_once() + + @patch("torch.npu.mem_get_info", + side_effect=RuntimeError("NPU not available")) + def test_mem_get_info_failure(self, mock_mem_get_info): + with self.assertRaises(RuntimeError): + self.platform.mem_get_info() + mock_mem_get_info.assert_called_once() + + @patch("gc.collect") + @patch("torch.npu.empty_cache") + @patch("torch.npu.reset_peak_memory_stats") + def test_clear_npu_memory_normal(self, mock_reset_stats, mock_empty_cache, + mock_gc_collect): + self.platform.clear_npu_memory() + + mock_gc_collect.assert_called_once() + mock_empty_cache.assert_called_once() + mock_reset_stats.assert_called_once() + + @patch("gc.collect", side_effect=Exception("GC failed")) + @patch("torch.npu.empty_cache") + @patch("torch.npu.reset_peak_memory_stats") + def test_clear_npu_memory_gc_collect_failure(self, mock_reset_stats, + mock_empty_cache, + mock_gc_collect): + with self.assertRaises(Exception): + self.platform.clear_npu_memory() + + mock_gc_collect.assert_called_once() + mock_empty_cache.assert_not_called() + 
mock_reset_stats.assert_not_called() + + @patch("gc.collect") + @patch("torch.npu.empty_cache", + side_effect=RuntimeError("Cache clear failed")) + @patch("torch.npu.reset_peak_memory_stats") + def test_clear_npu_memory_empty_cache_failure(self, mock_reset_stats, + mock_empty_cache, + mock_gc_collect): + with self.assertRaises(RuntimeError): + self.platform.clear_npu_memory() + + mock_gc_collect.assert_called_once() + mock_empty_cache.assert_called_once() + mock_reset_stats.assert_not_called() + + @patch("gc.collect") + @patch("torch.npu.empty_cache") + @patch("torch.npu.reset_peak_memory_stats", + side_effect=RuntimeError("Reset failed")) + def test_clear_npu_memory_reset_stats_failure(self, mock_reset_stats, + mock_empty_cache, + mock_gc_collect): + with self.assertRaises(RuntimeError): + self.platform.clear_npu_memory() + + mock_gc_collect.assert_called_once() + mock_empty_cache.assert_called_once() + mock_reset_stats.assert_called_once() + + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + @patch("vllm_ascend.utils.update_aclgraph_sizes") + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("os.environ", {}) + def test_check_and_update_config_basic_config_update( + self, mock_is_310p, mock_update_acl, mock_init_ascend, + mock_check_ascend): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.parallel_config.enable_expert_parallel = False + + # Use importlib.reload to reload the platform module, ensuring the mocked init_ascend_config method is used. + # Without this reload, when calling self.platform.check_and_update_config, + # it would execute the original unmocked init_ascend_config method, causing the unit test to fail. 
+ from vllm_ascend import platform + + importlib.reload(platform) + + self.platform.check_and_update_config(vllm_config) + + mock_init_ascend.assert_called_once_with(vllm_config) + mock_check_ascend.assert_called_once() + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_no_model_config_warning( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config = None + + with self.assertLogs(logger="vllm", level="WARNING") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + self.assertTrue("Model config is missing" in cm.output[0]) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_enforce_eager_mode( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config.enforce_eager = True + + with self.assertLogs(logger="vllm", level="INFO") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + self.assertTrue("Compilation disabled, using eager mode by default" in + cm.output[0]) + self.assertEqual( + vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + 
@patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_unsupported_compilation_level( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config.enforce_eager = False + vllm_config.compilation_config.level = CompilationLevel.DYNAMO_ONCE + + with self.assertLogs(logger="vllm", level="WARNING") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + self.assertTrue("NPU does not support" in cm.output[0]) + self.assertEqual( + vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @pytest.mark.skip( + "Revert me when vllm support setting cudagraph_mode on oot platform") + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_unsupported_cudagraph_mode( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config.enforce_eager = False + vllm_config.compilation_config.cudagraph_mode = CUDAGraphMode.FULL + + with self.assertLogs(logger="vllm", level="INFO") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + self.assertTrue( + "cudagraph_mode is not support on NPU. 
falling back to NONE" in + cm.output[0]) + self.assertEqual( + vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_disable_aclgraph_when_ray_enabled( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config.enforce_eager = False + vllm_config.compilation_config.level = CompilationLevel.PIECEWISE + vllm_config.parallel_config.distributed_executor_backend = "ray" + + with self.assertLogs(logger="vllm", level="WARNING") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + print(30 * "=", f"cm.output: {cm.output}") + self.assertTrue( + "Ray distributed executor backend is not compatible with ACL Graph mode" + in cm.output[0]) + self.assertEqual( + vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_torchair_enabled_compilation( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() + mock_ascend_config.torchair_graph_config.enabled = True + mock_init_ascend.return_value = mock_ascend_config + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.model_config.enforce_eager = False + 
vllm_config.compilation_config.level = CompilationLevel.PIECEWISE + + with self.assertLogs(logger="vllm", level="INFO") as cm: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + self.assertTrue("Torchair compilation enabled" in cm.output[0]) + self.assertEqual( + vllm_config.compilation_config.level, + CompilationLevel.NO_COMPILATION, + ) + self.assertEqual( + vllm_config.compilation_config.cudagraph_mode, + CUDAGraphMode.NONE, + ) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_cache_config_block_size( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.cache_config.block_size = None + vllm_config.cache_config.enable_prefix_caching = True + + from vllm_ascend import platform + + importlib.reload(platform) + + self.platform.check_and_update_config(vllm_config) + + self.assertEqual(vllm_config.cache_config.block_size, 128) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_v1_worker_class_selection( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.parallel_config.worker_cls = "auto" + + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + + self.assertEqual( + vllm_config.parallel_config.worker_cls, + "vllm_ascend.worker.worker_v1.NPUWorker", + ) + + test_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() + 
test_ascend_config.torchair_graph_config.enabled = True + mock_init_ascend.return_value = test_ascend_config + vllm_config.parallel_config.worker_cls = "auto" + self.platform.check_and_update_config(vllm_config) + self.assertEqual( + vllm_config.parallel_config.worker_cls, + "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker", + ) + + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + @patch("vllm_ascend.utils.is_310p", return_value=True) + def test_check_and_update_config_310p_no_custom_ops( + self, mock_is_310p, mock_init_ascend, mock_check_ascend): + mock_init_ascend.return_value = TestNPUPlatform.mock_vllm_ascend_config( + ) + vllm_config = TestNPUPlatform.mock_vllm_config() + vllm_config.compilation_config.custom_ops = [] + + from vllm_ascend import platform + + importlib.reload(platform) + + self.platform.check_and_update_config(vllm_config) + self.assertEqual(vllm_config.compilation_config.custom_ops, []) + + @patch("vllm_ascend.utils.is_310p", return_value=False) + @patch("vllm_ascend.ascend_config.check_ascend_config") + @patch("vllm_ascend.ascend_config.init_ascend_config") + def test_check_and_update_config_ascend_scheduler_config( + self, mock_init_ascend, mock_check_ascend, mock_is_310p): + mock_ascend_config = TestNPUPlatform.mock_vllm_ascend_config() + mock_ascend_config.ascend_scheduler_config.enabled = True + mock_init_ascend.return_value = mock_ascend_config + + vllm_config = TestNPUPlatform.mock_vllm_config() + + with patch("vllm_ascend.core.schedule_config.AscendSchedulerConfig" + ) as mock_scheduler: + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.check_and_update_config(vllm_config) + mock_scheduler.initialize_from_config.assert_called_once() + + @patch('vllm_ascend.platform.get_ascend_config') + def test_get_attn_backend_cls_use_v1_and_mla(self, mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled 
= False + + mock_get_ascend_config.return_value = mock_config + + result = self.platform.get_attn_backend_cls( + selected_backend="ascend", + head_size=64, + dtype="float16", + kv_cache_dtype="float16", + block_size=64, + use_v1=True, + use_mla=True, + ) + self.assertEqual(result, + "vllm_ascend.attention.mla_v1.AscendMLABackend") + + @patch('vllm_ascend.platform.get_ascend_config') + def test_get_attn_backend_cls_use_v1_mla_and_torchair( + self, mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = True + + mock_get_ascend_config.return_value = mock_config + + result = self.platform.get_attn_backend_cls( + selected_backend="ascend", + head_size=64, + dtype="float16", + kv_cache_dtype="float16", + block_size=64, + use_v1=True, + use_mla=True, + ) + self.assertEqual( + result, + "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend") + + @patch('vllm_ascend.platform.get_ascend_config') + def test_get_attn_backend_cls_use_v1_and_torchair(self, + mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = True + + mock_get_ascend_config.return_value = mock_config + + result = self.platform.get_attn_backend_cls( + selected_backend="ascend", + head_size=64, + dtype="float16", + kv_cache_dtype="float16", + block_size=64, + use_v1=True, + use_mla=False, + ) + self.assertEqual( + result, + "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend" + ) + + @patch('vllm_ascend.platform.get_ascend_config') + def test_get_attn_backend_cls_use_v1_only(self, mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + + mock_get_ascend_config.return_value = mock_config + + result = self.platform.get_attn_backend_cls( + selected_backend="ascend", + head_size=64, + dtype="float16", + kv_cache_dtype="float16", + block_size=64, + use_v1=True, + use_mla=False, + ) + self.assertEqual( + result, + 
"vllm_ascend.attention.attention_v1.AscendAttentionBackend") + + def test_get_punica_wrapper(self): + result = self.platform.get_punica_wrapper() + self.assertEqual( + result, + "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU") + + @patch("torch.npu.reset_peak_memory_stats") + @patch("torch.npu.max_memory_allocated") + def test_get_current_memory_usage_with_specific_device( + self, mock_max_memory, mock_reset_stats): + max_memory_allocated_result = 1024.0 + mock_max_memory.return_value = max_memory_allocated_result + test_device = torch.device("npu:0") + result = self.platform.get_current_memory_usage(device=test_device) + + mock_reset_stats.assert_called_once_with(test_device) + mock_max_memory.assert_called_once_with(test_device) + self.assertEqual(result, max_memory_allocated_result) + + @patch("torch.npu.reset_peak_memory_stats") + @patch("torch.npu.max_memory_allocated") + def test_get_current_memory_usage_with_default_device( + self, mock_max_memory, mock_reset_stats): + max_memory_allocated_result = 1024.0 + mock_max_memory.return_value = max_memory_allocated_result + + result = self.platform.get_current_memory_usage() + + mock_reset_stats.assert_called_once_with(None) + mock_max_memory.assert_called_once_with(None) + self.assertEqual(result, max_memory_allocated_result) + + @patch("torch.npu.reset_peak_memory_stats", + side_effect=RuntimeError("Device error")) + @patch("torch.npu.max_memory_allocated") + def test_get_current_memory_usage_when_reset_stats_fails( + self, mock_max_memory, mock_reset_stats): + with self.assertRaises(RuntimeError): + self.platform.get_current_memory_usage() + mock_reset_stats.assert_called_once() + mock_max_memory.assert_not_called() + + @patch("torch.npu.reset_peak_memory_stats") + @patch( + "torch.npu.max_memory_allocated", + side_effect=RuntimeError("Memory query failed"), + ) + def test_get_current_memory_usage_when_query_fails(self, mock_max_memory, + mock_reset_stats): + with self.assertRaises(RuntimeError): + 
self.platform.get_current_memory_usage() + mock_reset_stats.assert_called_once() + mock_max_memory.assert_called_once() + + def test_get_device_communicator_cls_returns_correct_value(self): + self.assertEqual( + self.platform.get_device_communicator_cls(), + "vllm_ascend.distributed.communicator.NPUCommunicator", + ) + + def test_is_pin_memory_available_returns_true(self): + self.assertTrue(self.platform.is_pin_memory_available()) + + def test_supports_v1(self): + from vllm.config import ModelConfig + + mock_config = MagicMock(spec=ModelConfig) + self.assertTrue(self.platform.supports_v1(mock_config)) + + def test_get_static_graph_wrapper_cls_returns_correct_value(self): + self.assertEqual( + self.platform.get_static_graph_wrapper_cls(), + "vllm_ascend.compilation.acl_graph.ACLGraphWrapper", + ) + + @patch("torch.distributed.is_hccl_available", return_value=True) + @patch("torch_npu._C._distributed_c10d.ProcessGroupHCCL") + @patch("torch.distributed.ProcessGroup") + def test_successful_initialization(self, mock_pg, mock_pg_hccl, _): + mock_prefix = MagicMock(spec=PrefixStore) + mock_backend = MagicMock() + mock_pg_hccl.return_value = mock_backend + group_rank = 0 + group_size = 4 + + mock_pg_instance = MagicMock(spec=ProcessGroup) + mock_pg.return_value = mock_pg_instance + + # Use importlib.reload() to force-reload the platform module and ensure the mocked ProcessGroup is used. + # Without this reload, when executing self.platform.stateless_init_device_torch_dist_pg(), + # it would invoke the original unmocked ProcessGroup implementation instead of our test mock, + # which would cause the unit test to fail. 
+ from vllm_ascend import platform + + importlib.reload(platform) + + result = self.platform.stateless_init_device_torch_dist_pg( + backend="hccl", + prefix_store=mock_prefix, + group_rank=group_rank, + group_size=group_size, + timeout=timedelta(seconds=30), + ) + + mock_pg.assert_called_once_with(mock_prefix, group_rank, group_size) + mock_pg_hccl.assert_called_once_with(mock_prefix, group_rank, + group_size, unittest.mock.ANY) + mock_backend._set_sequence_number_for_group.assert_called_once() + mock_pg_instance._register_backend.assert_called_once_with( + torch.device("npu"), unittest.mock.ANY, mock_backend) + self.assertEqual(result, mock_pg_instance) + + @patch("torch.distributed.is_hccl_available", return_value=False) + def test_hccl_unavailable(self, _): + with self.assertRaises(AssertionError): + from vllm_ascend import platform + + importlib.reload(platform) + self.platform.stateless_init_device_torch_dist_pg( + backend="hccl", + prefix_store=MagicMock(), + group_rank=0, + group_size=4, + timeout=timedelta(seconds=30), + ) diff --git a/tests/ut/test_utils.py b/tests/ut/test_utils.py new file mode 100644 index 0000000..0d264c7 --- /dev/null +++ b/tests/ut/test_utils.py @@ -0,0 +1,351 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +import math +import os +from threading import Lock +from unittest import mock + +import torch +from vllm.config import (CompilationConfig, ModelConfig, ParallelConfig, + VllmConfig) + +from tests.ut.base import TestBase +from vllm_ascend import utils + + +class TestUtils(TestBase): + + def test_is_310p(self): + utils._IS_310P = None + with mock.patch("vllm_ascend._build_info.__soc_version__", + "Ascend310P3"): + self.assertTrue(utils.is_310p()) + utils._IS_310P = None + with mock.patch("vllm_ascend._build_info.__soc_version__", + "Ascend910P1"): + self.assertFalse(utils.is_310p()) + + def test_sleep_mode_enabled(self): + utils._SLEEP_MODE_ENABLED = None + with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__", + True): + self.assertTrue(utils.sleep_mode_enabled()) + utils._SLEEP_MODE_ENABLED = None + with mock.patch("vllm_ascend._build_info.__sleep_mode_enabled__", + False): + self.assertFalse(utils.sleep_mode_enabled()) + + def test_nd_to_nz_2d(self): + # can be divided by 16 + input_tensor = torch.randn(32, 64) + output = utils.nd_to_nz_2d(input_tensor) + self.assertEqual(output.shape[0], 1) + self.assertEqual(output.shape[1], 64 // 16) + self.assertEqual(output.shape[2], 32) + self.assertEqual(output.shape[3], 16) + + # cannot be divided by 16 + input_tensor = torch.randn(30, 62) + output = utils.nd_to_nz_2d(input_tensor) + self.assertEqual(output.shape[0], 1) + self.assertEqual(output.shape[1], math.ceil(62 / 16)) + self.assertEqual(output.shape[2], 32) + self.assertEqual(output.shape[3], 16) + + # pad to 16 + input_tensor = torch.randn(8, 12) + output = utils.nd_to_nz_2d(input_tensor) + self.assertEqual(output.shape[0], 1) + self.assertEqual(output.shape[1], 1) # 12->16, 16//16=1 + self.assertEqual(output.shape[2], 16) # 8->16 + self.assertEqual(output.shape[3], 16) + + # check if the output is contiguous + input_tensor = torch.randn(32, 64) + output = utils.nd_to_nz_2d(input_tensor) + self.assertTrue(output.is_contiguous()) + + # check if the 
output values are preserved + input_tensor = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]]) + output = utils.nd_to_nz_2d(input_tensor) + expected = torch.tensor( + [[[[1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]]) + self.assertTrue(torch.allclose(output, expected)) + + def test_aligned_16(self): + # align to 16 + input_tensor = torch.randn(15, 64) + output_tensor = utils.aligned_16(input_tensor) + self.assertEqual(output_tensor.shape[0], 16) + + # align to 16 + input_tensor = torch.randn(16, 64) + output_tensor = utils.aligned_16(input_tensor) + self.assertEqual(output_tensor.shape[0], 16) + self.assertTrue(torch.equal(input_tensor, output_tensor)) + + # align to 32 + input_tensor = torch.randn(17, 64) + output_tensor = utils.aligned_16(input_tensor) + self.assertEqual(output_tensor.shape[0], 32) + + @mock.patch('importlib.util.find_spec') + @mock.patch('importlib.import_module') + def test_try_register_lib(self, mock_import_module, mock_find_spec): + # import OK + mock_find_spec.return_value = mock.MagicMock() + mock_import_module.return_value = mock.MagicMock() + lib_name = "existing_lib" + lib_info = "Library found and imported successfully" + utils.try_register_lib(lib_name, lib_info) + + # 
Can't find lib + mock_find_spec.return_value = None + lib_name = "non_existing_lib" + utils.try_register_lib(lib_name) + + # import error + mock_find_spec.return_value = mock.MagicMock() + mock_import_module.side_effect = ImportError("import error") + lib_name = "error_lib" + utils.try_register_lib(lib_name) + + def test_enable_custom_op(self): + result = utils.enable_custom_op() + self.assertTrue(result) + + utils._CUSTOM_OP_ENABLED = None + + with mock.patch('builtins.__import__') as mock_import_module: + mock_import_module.side_effect = ImportError("import error") + self.assertFalse(utils.enable_custom_op()) + + def test_find_hccl_library(self): + with mock.patch.dict(os.environ, + {"HCCL_SO_PATH": "/path/to/hccl/libhccl.so"}): + self.assertEqual(utils.find_hccl_library(), + "/path/to/hccl/libhccl.so") + with mock.patch("torch.version.cann", None): + self.assertRaises(ValueError, utils.find_hccl_library) + with mock.patch("torch.version.cann", "Ascend910"): + self.assertEqual(utils.find_hccl_library(), "libhccl.so") + + def test_current_stream(self): + with mock.patch("torch.npu.current_stream") as mock_current_stream: + self.assertEqual(utils.current_stream(), mock_current_stream()) + + def test_vllm_version_is(self): + with mock.patch.dict(os.environ, {"VLLM_VERSION": "1.0.0"}): + with mock.patch("vllm.__version__", "1.0.0"): + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) + with mock.patch("vllm.__version__", "2.0.0"): + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) + with mock.patch("vllm.__version__", "1.0.0"): + self.assertTrue(utils.vllm_version_is.__wrapped__("1.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("2.0.0")) + with mock.patch("vllm.__version__", "2.0.0"): + self.assertTrue(utils.vllm_version_is.__wrapped__("2.0.0")) + self.assertFalse(utils.vllm_version_is.__wrapped__("1.0.0")) + 
# Test caching takes effect + utils.vllm_version_is.cache_clear() + utils.vllm_version_is("1.0.0") + misses = utils.vllm_version_is.cache_info().misses + hits = utils.vllm_version_is.cache_info().hits + self.assertEqual(misses, 1) + self.assertEqual(hits, 0) + utils.vllm_version_is("1.0.0") + hits = utils.vllm_version_is.cache_info().hits + self.assertEqual(hits, 1) + + def test_get_max_hidden_layers(self): + from transformers import PretrainedConfig + + class SimpleConfig(PretrainedConfig): + + def __init__(self, num_hidden_layers=12): + self.num_hidden_layers = num_hidden_layers + + def to_dict(self): + return {"num_hidden_layers": self.num_hidden_layers} + + self.assertEqual(utils.get_max_hidden_layers(SimpleConfig()), 12) + self.assertEqual(utils.get_max_hidden_layers(SimpleConfig(24)), 24) + + class NestedConfig(PretrainedConfig): + + def to_dict(self): + return { + "model": { + "encoder": { + "num_hidden_layers": 8 + }, + "decoder": { + "num_hidden_layers": 12 + } + }, + "other_setting": True + } + + self.assertEqual(utils.get_max_hidden_layers(NestedConfig()), 12) + + class MultiValueConfig(PretrainedConfig): + + def to_dict(self): + return { + "num_hidden_layers": 6, + "submodule": { + "num_hidden_layers": 18, + "subsub": { + "num_hidden_layers": 9 + } + } + } + + self.assertEqual(utils.get_max_hidden_layers(MultiValueConfig()), 18) + + class NoLayerConfig(PretrainedConfig): + + def to_dict(self): + return {"attention_heads": 8} + + with self.assertRaises(ValueError) as context: + utils.get_max_hidden_layers(NoLayerConfig()) + self.assertIn("num_hidden_layers", str(context.exception)) + + def test_update_aclgraph_sizes(self): + # max_num_batch_sizes < len(original_sizes) + test_compilation_config = CompilationConfig( + cudagraph_capture_sizes=[i for i in range(150)]) + model_path = os.path.join(os.path.dirname(__file__), "fake_weight") + test_model_config = ModelConfig(model=model_path, enforce_eager=True) + test_parallel_config = ParallelConfig() + 
test_vllm_config = VllmConfig( + model_config=test_model_config, + compilation_config=test_compilation_config, + parallel_config=test_parallel_config, + ) + utils.update_aclgraph_sizes(test_vllm_config) + os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' + utils.update_aclgraph_sizes(test_vllm_config) + del os.environ['HCCL_OP_EXPANSION_MODE'] + self.assertEqual( + 147, + len(test_vllm_config.compilation_config.cudagraph_capture_sizes)) + # max_num_batch_sizes >= len(original_sizes) + test_compilation_config = CompilationConfig( + cudagraph_capture_sizes=[1, 2, 3]) + test_vllm_config = VllmConfig( + model_config=test_model_config, + compilation_config=test_compilation_config, + parallel_config=test_parallel_config, + ) + utils.update_aclgraph_sizes(test_vllm_config) + os.environ['HCCL_OP_EXPANSION_MODE'] = 'AIV' + utils.update_aclgraph_sizes(test_vllm_config) + del os.environ['HCCL_OP_EXPANSION_MODE'] + self.assertEqual( + 3, + len(test_vllm_config.compilation_config.cudagraph_capture_sizes)) + + @mock.patch("vllm.model_executor.custom_op.CustomOp") + @mock.patch("vllm_ascend.ops.activation.AscendQuickGELU") + @mock.patch("vllm_ascend.ops.activation.AscendSiluAndMul") + @mock.patch("vllm_ascend.ops.layernorm.AscendRMSNorm") + def test_register_ascend_customop(self, mock_ascend_rmsnorm, + mock_ascend_silu_and_mul, + mock_ascend_quick_gelu, mock_customop): + utils._ASCEND_CUSTOMOP_IS_REIGISTERED = False + + # ascend custom op is not registered + utils.register_ascend_customop() + # should call register_oot three + self.assertEqual(mock_customop.register_oot.call_count, 12) + self.assertTrue(utils._ASCEND_CUSTOMOP_IS_REIGISTERED) + + # ascend custom op is already registered + utils.register_ascend_customop() + # should not register_oot again, thus only called three in this ut + self.assertEqual(mock_customop.register_oot.call_count, 12) + + +class TestProfileExecuteDuration(TestBase): + + def setUp(self): + utils.ProfileExecuteDuration._instance = None + 
utils.ProfileExecuteDuration._observations = [] + utils.ProfileExecuteDuration._lock = Lock() + + def test_singleton_creation(self): + instance1 = utils.ProfileExecuteDuration() + self.assertIsNotNone(instance1) + self.assertIs(instance1, utils.ProfileExecuteDuration._instance) + + instance2 = utils.ProfileExecuteDuration() + self.assertIs(instance1, instance2) + + def test_thread_safety(self): + from threading import Thread + + instances = [] + + def create_instance(): + instances.append(utils.ProfileExecuteDuration()) + + threads = [Thread(target=create_instance) for _ in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + + first_instance = instances[0] + for instance in instances[1:]: + self.assertIs(first_instance, instance) + + def test_atexit_registration(self): + with mock.patch('atexit.register') as mock_register: + instance = utils.ProfileExecuteDuration() + mock_register.assert_called_once_with(instance.destroy) + + def test_lock_usage(self): + original_lock = utils.ProfileExecuteDuration._lock + + with mock.patch.object(utils.ProfileExecuteDuration, + '_lock', + wraps=original_lock) as mock_lock: + utils.ProfileExecuteDuration() + mock_lock.__enter__.assert_called() + mock_lock.__exit__.assert_called() + + def test_observations_initialization(self): + instance = utils.ProfileExecuteDuration() + self.assertEqual(instance._observations, []) diff --git a/tests/ut/torchair/__init__.py b/tests/ut/torchair/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/ut/torchair/models/test_torchair_deepseek_mtp.py b/tests/ut/torchair/models/test_torchair_deepseek_mtp.py new file mode 100644 index 0000000..7aafdfc --- /dev/null +++ b/tests/ut/torchair/models/test_torchair_deepseek_mtp.py @@ -0,0 +1,195 @@ +import pytest +import torch +from pytest_mock import MockerFixture +from transformers import PretrainedConfig +from vllm.config import CacheConfig, ModelConfig, VllmConfig + +from tests.ut.base import PytestBase +from 
vllm_ascend.torchair.models.torchair_deepseek_mtp import ( + TorchairDeepSeekMTP, TorchairDeepSeekMultiTokenPredictor, + TorchairDeepSeekMultiTokenPredictorLayer) + + +class TestTorchairDeepSeekMultiTokenPredictorLayer(PytestBase): + + @pytest.fixture + def setup_mtp_layer(self, mocker: MockerFixture): + config = PretrainedConfig(vocab_size=1000, + hidden_size=768, + rms_norm_eps=1e-5) + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm.model_executor.layers.layernorm.RMSNorm.__init__", + return_value=None) + mocker.patch( + "vllm.model_executor.models.deepseek_mtp.SharedHead.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekShareHead.__init__", + return_value=None) + mocker_deepseek_v2_decode_layer = mocker.patch( + "vllm_ascend.torchair.models.torchair_deepseek_v2.TorchairDeepseekV2DecoderLayer.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + mtp_layer = TorchairDeepSeekMultiTokenPredictorLayer(config, "", None) + mocker_deepseek_v2_decode_layer.assert_called_once() + return mtp_layer + + def test_init(self, mocker: MockerFixture, setup_mtp_layer): + mtp_layer = setup_mtp_layer + assert isinstance(mtp_layer, TorchairDeepSeekMultiTokenPredictorLayer) + + def test_forward(self, mocker: MockerFixture, setup_mtp_layer): + mtp_layer = setup_mtp_layer + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker.patch.object(mtp_layer, + 'eh_proj', + return_value=torch.randn(2, 3, 768)) + mocker.patch("torch.cat", return_value=torch.randn(2, 3, 768)) + mtp_layer.mtp_block.return_value = (torch.randn(2, 3, 768), + torch.randn(2, 3, 768)) + 
+ input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]]) + positions = torch.tensor([[0, 1, 2], [0, 1, 2]]) + kv_cache = torch.randn(2, 3, 768) + previous_hidden_states = torch.randn(2, 3, 768) + inputs_embeds = torch.tensor([[1.0, 2.0, 3.0]]) + + output = mtp_layer(input_ids, positions, kv_cache, None, + previous_hidden_states, inputs_embeds, 0) + assert output.shape == (2, 3, 768) + + +class TestTorchairDeepSeekMultiTokenPredictor(PytestBase): + + @pytest.fixture + def setup_predictor(self, mocker: MockerFixture): + mock_vllm_config = mocker.MagicMock(spec=VllmConfig) + mock_model_config = mocker.MagicMock(spec=ModelConfig) + mock_hf_config = mocker.MagicMock() + mock_hf_config.num_hidden_layers = 12 + mock_hf_config.num_nextn_predict_layers = 3 + mock_hf_config.vocab_size = 30000 + mock_model_config.hf_config = mock_hf_config + mock_vllm_config.model_config = mock_model_config + mock_vllm_config.cache_config = CacheConfig() + mock_vllm_config.quant_config = mocker.MagicMock() + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + predictor = TorchairDeepSeekMultiTokenPredictor( + vllm_config=mock_vllm_config) + return predictor + + def test_init(self, mocker: MockerFixture, setup_predictor): + predictor = setup_predictor + assert predictor.num_mtp_layers == 3 + assert isinstance(predictor, TorchairDeepSeekMultiTokenPredictor) + + @pytest.mark.parametrize( + 'kv_caches, inputs_embeds', + [(torch.tensor([[[0.1, 0.2, 0.3]]]), torch.tensor([[0.1, 0.2, 0.3]]))]) + def test_forward(self, mocker: MockerFixture, setup_predictor, kv_caches, + inputs_embeds): + 
predictor = setup_predictor + mock_layer = mocker.MagicMock() + mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) + predictor.layers_list = [mock_layer] + + # todo: need or not? + # predictor.num_mtp_layers = 1 + input_ids = torch.tensor([[1, 2, 3]]) + positions = torch.tensor([[0, 1, 2]]) + mocker.patch( + "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__", + return_value=torch.tensor([[1.0, 2.0, 3.0]])) + output = predictor.forward(input_ids, positions, kv_caches, None, None, + inputs_embeds, 0) + mock_layer.assert_called_once() + assert torch.allclose(output, torch.tensor([1.0, 2.0, 3.0])) + + def test_compute_logits(self, mocker: MockerFixture, setup_predictor): + hidden_states = torch.tensor([[1, 2, 3], [4, 5, 6]]) + predictor = setup_predictor + + mock_layer = mocker.MagicMock() + mock_layer.return_value = torch.tensor([1.0, 2.0, 3.0]) + predictor.layers_list = [mock_layer] + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + mocker.patch("torch.nn.Module.__delattr__") + mocker.patch( + "vllm.model_executor.layers.logits_processor.LogitsProcessor.__init__", + return_value=None) + predictor.logits_processor.return_value = torch.tensor([1.0, 2.0, 3.0]) + + result_logits = predictor.compute_logits(hidden_states=hidden_states, + sampling_metadata=None) + predictor.logits_processor.assert_called_once() + assert torch.allclose(result_logits, torch.tensor([1.0, 2.0, 3.0])) + + +class TestTorchairDeepSeekMTP(PytestBase): + + @pytest.fixture + def setup_mtp(self, mocker: MockerFixture): + vllm_config = mocker.MagicMock() + vllm_config.model_config.hf_config.num_hidden_layers = 12 + vllm_config.model_config.hf_config.num_nextn_predict_layers = 3 + vllm_config.cache_config = mocker.MagicMock() + vllm_config.quant_config = mocker.MagicMock() + + mocker.patch("torch.nn.Module.__setattr__") + mocker.patch("torch.nn.Module.__getattr__") + 
mocker.patch("torch.nn.Module.__delattr__") + mocker.patch( + "vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__", + return_value=None) + mocker.patch( + "vllm_ascend.torchair.models.torchair_deepseek_mtp.TorchairDeepSeekMultiTokenPredictorLayer.__call__", + return_value=None) + mocker.patch("vllm.model_executor.layers.sampler.get_sampler", + return_value=None) + mocker.patch( + "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__", + return_value=None) + mocker.patch("vllm_ascend.utils.get_ascend_config", + return_value=mocker.Mock()) + + mtp = TorchairDeepSeekMTP(vllm_config=vllm_config) + return mtp + + def test_init(self, mocker: MockerFixture, setup_mtp): + mtp = setup_mtp + assert isinstance(mtp, TorchairDeepSeekMTP) + + def test_forward(self, mocker: MockerFixture, setup_mtp): + input_ids = torch.tensor([[1, 2, 3]]) + positions = torch.tensor([[0, 1, 2]]) + kv_caches = [torch.tensor([[0.1, 0.2, 0.3]])] + previous_hidden_states = torch.tensor([[0.1, 0.2, 0.3]]) + inputs_embeds = torch.tensor([[0.1, 0.2, 0.3]]) + spec_step_idx = 0 + setup_mtp.model.return_value = torch.tensor([[1.0, 2.0, 3.0]]) + + output = setup_mtp.forward(input_ids, positions, kv_caches, None, + previous_hidden_states, inputs_embeds, + spec_step_idx) + assert torch.allclose(output, torch.tensor([[1.0, 2.0, 3.0]])) diff --git a/tests/ut/torchair/models/test_torchair_deepseek_v2.py b/tests/ut/torchair/models/test_torchair_deepseek_v2.py new file mode 100644 index 0000000..e72d023 --- /dev/null +++ b/tests/ut/torchair/models/test_torchair_deepseek_v2.py @@ -0,0 +1,325 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +from types import SimpleNamespace +from unittest.mock import Mock, patch + +import pytest +import torch +from transformers import PretrainedConfig +from vllm.config import CacheConfig +from vllm.distributed.parallel_state import GroupCoordinator + +from vllm_ascend.torchair.models.torchair_deepseek_v2 import ( + TorchairDeepseekV2DecoderLayer, TorchairDeepseekV2ForCausalLM, + TorchairDeepseekV2MergedReplicatedLinear, TorchairDeepseekV2MLAAttention, + TorchairDeepseekV2MLP, TorchairDeepseekV2MoE, + TorchairDeepseekV2RowParallelLinear, + TorchairDeepseekV2RowParallelLinearReplaceAllreduce, + TorchairDeepseekV2SiluAndMul) + + +@pytest.fixture +def base_config(): + config = PretrainedConfig( + hidden_size=128, + num_attention_heads=8, + num_hidden_layers=2, + intermediate_size=256, + hidden_act="silu", + rms_norm_eps=1e-6, + rope_theta=10000.0, + max_position_embeddings=2048, + n_routed_experts=4, + n_shared_experts=1, + moe_intermediate_size=256, + num_experts_per_tok=2, + routed_scaling_factor=1.0, + first_k_dense_replace=0, + moe_layer_freq=1, + kv_lora_rank=16, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + topk_method="noaux_tc", + scoring_func="softmax", + norm_topk_prob=True, + n_group=1, + topk_group=1, + vocab_size=10000, + ) + return config + + +@pytest.fixture +def vllm_config(base_config): + model_config = SimpleNamespace( + hf_config=base_config, + tensor_parallel_size=1, + dtype=torch.float32, + use_mla=False, + quant_config=None, + max_model_len=2048, + ) + + cache_config = 
CacheConfig() + vllm_config = Mock() + vllm_config.model_config = model_config + vllm_config.cache_config = cache_config + vllm_config.quant_config = None + return vllm_config + + +@pytest.fixture +def mock_distributed(): + tp_group = Mock(spec=GroupCoordinator) + tp_group.rank_in_group = 0 + tp_group.world_size = 1 + tp_group.device_group = Mock() + + dp_group = Mock(spec=GroupCoordinator) + dp_group.rank_in_group = 0 + dp_group.world_size = 1 + + ep_group = Mock(spec=GroupCoordinator) + ep_group.rank_in_group = 0 + ep_group.world_size = 1 + + pp_group = Mock(spec=GroupCoordinator) + pp_group.rank_in_group = 0 + pp_group.world_size = 1 + + mock_vllm_config = Mock() + mock_vllm_config.scheduler_config = Mock(max_num_seqs=256) + mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None) + + with patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_tp_group", return_value=tp_group), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_ep_group", return_value=ep_group), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_dp_group", return_value=dp_group), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_pp_group", return_value=pp_group), \ + patch("vllm_ascend.torchair.models.torchair_deepseek_v2.get_pp_group", + return_value=Mock(is_first_rank=False, is_last_rank=False)), \ + patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \ + patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group, + _PP=pp_group), \ + patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group): + yield + + +@pytest.fixture +def mock_forward_context(): + forward_context = Mock(in_profile_run=False, 
with_prefill=False) + with patch( + "vllm_ascend.torchair.models.torchair_deepseek_v2.get_forward_context", + return_value=forward_context): + yield + + +def test_torchair_deepseek_v2_silu_and_mul(): + torch.set_default_device("cpu") + + silu = TorchairDeepseekV2SiluAndMul() + assert silu.weight_scale is None + + x = torch.randn(2, 4) + output = silu.forward_oot(x) + assert output.shape == (2, 2) + + weight_scale = Mock(return_value=torch.tensor(0.1)) + silu = TorchairDeepseekV2SiluAndMul(weight_scale=weight_scale) + quant_x = torch.randint(-128, 127, (2, 4), dtype=torch.int32) + dynamic_scale = torch.randn(2, 1) + with patch("torch_npu.npu_dequant_swiglu_quant", + return_value=torch.randn(2, 4)): + output = silu.forward_oot((quant_x, dynamic_scale)) + assert output.shape == (2, 4) + + +def test_torchair_deepseek_v2_merged_replicated_linear(mock_distributed): + linear = TorchairDeepseekV2MergedReplicatedLinear(input_size=128, + output_sizes=[64, 64], + bias=False, + quant_config=None) + assert linear.output_sizes == [64, 64] + + param = Mock() + param.data = torch.zeros(128, 128) + param.output_dim = 1 + param.is_gguf_weight = False + param.is_gguf_weight_type = False + loaded_weight = torch.randn(128, 64) + linear.weight_loader(param, loaded_weight, loaded_shard_id=0) + + with pytest.raises(AssertionError): + linear.weight_loader(param, torch.randn(128, 32), loaded_shard_id=0) + + +@pytest.mark.parametrize("cls", [ + TorchairDeepseekV2RowParallelLinearReplaceAllreduce, + TorchairDeepseekV2RowParallelLinear +]) +def test_row_parallel_linear(cls, mock_distributed): + linear = cls(input_size=128, output_size=64, bias=False, quant_config=None) + linear.quant_method = Mock() + linear.quant_method.apply.return_value = torch.randn(2, 4, 64) + + input_ = torch.randn(2, 4, 128) + with patch( + "vllm_ascend.torchair.models.torchair_deepseek_v2.split_tensor_along_last_dim", + return_value=[torch.randn(2, 4, 64)]): + linear.input_is_parallel = False + output = linear(input_, 
is_prefill=True) + assert output[0].shape == (2, 4, 64) + + linear.input_is_parallel = True + output = linear(input_, is_prefill=False) + assert output[0].shape == (2, 4, 64) + + +def test_torchair_deepseek_v2_mlp(mock_distributed, base_config): + mlp = TorchairDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="silu", + quant_config=None) + assert isinstance(mlp.act_fn, TorchairDeepseekV2SiluAndMul) + + x = torch.randn(2, 4, 128) + output = mlp(x) + assert output.shape == (2, 4, 128) + + with patch( + "vllm_ascend.torchair.models.torchair_deepseek_v2.QuantizationConfig" + ) as mock_quant_config: + mock_quant_config.name = "w8a8dynamic" + with pytest.raises(NotImplementedError): + TorchairDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="silu", + quant_config=mock_quant_config, + force_replicate=False) + with pytest.raises(ValueError): + TorchairDeepseekV2MLP(hidden_size=128, + intermediate_size=256, + hidden_act="relu", + quant_config=None) + + +def test_torchair_deepseek_v2_moe(mock_distributed, base_config, + mock_forward_context): + base_config.n_shared_experts = 1 + moe = TorchairDeepseekV2MoE(config=base_config, + quant_config=None, + prefix="mlp") + assert moe.top_k == 2 + + x = torch.randn(2, 4, 128) + attn_metadata = Mock(num_prefills=1) + with patch( + "vllm_ascend.torchair.ops.torchair_fused_moe.TorchairAscendFusedMoE.__call__", + return_value=(torch.randn(2, 4, 128), torch.randn(2, 4, 128))): + output = moe(x, attn_metadata) + assert output.shape == (2, 4, 128) + + +@patch("torch_npu.npu_rms_norm") +def test_torchair_deepseek_v2_mla_attention(mock_rms_norm, mock_distributed, + base_config): + mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128)) + + attn = TorchairDeepseekV2MLAAttention(config=base_config, + hidden_size=128, + num_heads=8, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + q_lora_rank=16, + kv_lora_rank=16, + cache_config=CacheConfig(), + quant_config=None, + 
prefix="layers.0.self_attn") + assert attn.debug_layer_idx == 0 + + x = torch.randn(2, 4, 128) + positions = torch.arange(4).repeat(2, 1) + with patch.object(attn.mla_attn, + "__call__", + return_value=torch.randn(2, 4, 128)): + with pytest.raises(AssertionError): + attn(positions, x) + + attn = TorchairDeepseekV2MLAAttention(config=base_config, + hidden_size=128, + num_heads=8, + qk_nope_head_dim=16, + qk_rope_head_dim=16, + v_head_dim=32, + q_lora_rank=None, + kv_lora_rank=16, + prefix="layers.1.self_attn") + assert hasattr(attn, "q_proj") + + +@patch("torch_npu.npu_add_rms_norm") +@patch("torch_npu.npu_rms_norm") +def test_torchair_deepseek_v2_decoder_layer(mock_rms_norm, mock_add_norm, + mock_distributed, base_config, + vllm_config): + mock_rms_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128)) + mock_add_norm.return_value = (torch.randn(2, 128), torch.randn(2, 128), + torch.randn(2, 128)) + base_config.n_routed_experts = 4 + layer = TorchairDeepseekV2DecoderLayer( + config=base_config, + prefix="layers.0", + model_config=vllm_config.model_config, + cache_config=CacheConfig(), + quant_config=None) + assert isinstance(layer.mlp, TorchairDeepseekV2MoE) + + x = torch.randn(2, 4, 128) + positions = torch.arange(4).repeat(2, 1) + + with patch.object(layer.self_attn, "forward", Mock(return_value=torch.randn(2, 4, 128))), \ + patch.object(layer.mlp, "forward", Mock(return_value=torch.randn(2, 4, 128))): + hidden_states, residual = layer(positions, x, None) + assert hidden_states.shape == (2, 4, 128) + + base_config.n_routed_experts = None + layer = TorchairDeepseekV2DecoderLayer( + config=base_config, + prefix="layers.0", + model_config=vllm_config.model_config, + quant_config=None) + assert isinstance(layer.mlp, TorchairDeepseekV2MLP) + + +def test_torchair_deepseek_v2_for_causal_lm(mock_distributed, vllm_config): + model = TorchairDeepseekV2ForCausalLM(vllm_config=vllm_config) + + input_ids = torch.randint(0, 10000, (2, 4)) + positions = 
torch.arange(4).repeat(2, 1) + with patch.object(model.model, + "forward", + return_value=torch.randn(2, 4, 128)): + output = model(input_ids, positions) + assert output.shape == (2, 4, 128) + + weights = [("model.embed_tokens.weight", torch.randn(10000, 128))] + with patch( + "vllm.model_executor.model_loader.weight_utils.default_weight_loader" + ): + loaded = model.load_weights(weights) + assert loaded is not None diff --git a/tests/ut/torchair/ops/test_torchair_fused_moe.py b/tests/ut/torchair/ops/test_torchair_fused_moe.py new file mode 100644 index 0000000..19df5dc --- /dev/null +++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py @@ -0,0 +1,410 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +from typing import List, TypedDict +from unittest.mock import MagicMock, patch + +import pytest +import torch +import torch.nn as nn +import torch_npu +from pytest_mock import MockerFixture +from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase + +from vllm_ascend.ascend_forward_context import _get_fused_moe_state +from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod +from vllm_ascend.quantization.quantizer import W8A8Quantizer +from vllm_ascend.torchair.ops.torchair_fused_moe import ( + TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) +from vllm_ascend.utils import AscendSocVersion, adapt_patch # noqa E402 + +adapt_patch(True) + + +def mock_ep_and_mc2_group(mocker): + mock_group = mocker.MagicMock() + mock_group.rank_in_group = 0 + mock_group.rank = 0 + mock_group.world_size = 4 + mock_group.device_group = "mock_group_ep" + mock_group.all_to_all = MagicMock(return_value=torch.randn(8, 8)) + return mock_group + + +def mock_dp_and_tp_group(mocker): + mock_group = mocker.MagicMock() + mock_group.rank_in_group = 0 + mock_group.world_size = 2 + mock_group.device_group = "mock_group" + mock_group.all_gather = MagicMock(return_value=torch.randn(10, 32)) + return mock_group + + +@pytest.fixture +def mock_dist_env(mocker: MockerFixture): + # init dist env patch + + with patch('torch.distributed.get_rank', return_value=0), \ + patch('torch.distributed.get_world_size', return_value=4), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_dp_group', 
return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \ + patch('torch.distributed.all_gather', return_value=MagicMock(return_value=torch.randn(10,32))), \ + patch('torch.distributed.all_to_all_single', return_value=torch.randn(8, 32)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.tensor_model_parallel_all_reduce', + return_value=torch.randn(5, 32)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.data_parallel_reduce_scatter', + return_value=torch.randn(5, 32)), \ + patch('vllm.model_executor.layers.fused_moe.config.get_dp_group', + return_value=mock_dp_and_tp_group(mocker)), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config', + return_value=MagicMock( + torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False), + expert_map_path=None + )), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.determine_expert_map', + return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context', + return_value=MagicMock( + max_tokens_across_dp=10, + dp_metadata=MagicMock(cu_tokens_across_dp_cpu=[5, 10]) + )), \ + patch('vllm_ascend.torchair.ops.torchair_fused_moe.get_current_vllm_config', + return_value=MagicMock( + parallel_config=MagicMock(tensor_parallel_size=2), + scheduler_config=MagicMock(max_num_seqs=4), + model_config=MagicMock(max_model_len=2048) + )): + yield + + +@pytest.fixture +def mock_moe_env(mocker: MockerFixture): + # init moe env patch + + with patch('torch_npu.npu_moe_gating_top_k', return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + None + )), \ + patch('torch_npu.npu_moe_init_routing', return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]) + )), \ + patch("torch_npu.npu_moe_compute_expert_tokens", return_value=( + torch.randn(8, 2) + )), \ + 
patch("torch_npu.npu_moe_distribute_dispatch", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_moe_distribute_combine", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_grouped_matmul", return_value=( + [torch.randn(16, 2)] + )), \ + patch("torch_npu.npu_swiglu", return_value=( + torch.randn(16, 2) + )), \ + patch("torch_npu.npu_moe_gating_top_k_softmax", return_value=( + torch.randn(8, 2), + torch.randint(0, 8, (8, 2)), + torch.tensor([0, 1, 2, 4, 6, 2, 7, 1]) + )), \ + patch("torch_npu.npu_moe_finalize_routing", return_value=( + torch.randn(16, 2) + )): + if hasattr(torch_npu, 'npu_moe_distribute_dispatch_v2'): + with patch("torch_npu.npu_moe_distribute_dispatch_v2", return_value=( + torch.randn(16, 2))), \ + patch("torch_npu.npu_moe_distribute_combine_v2", return_value=( + torch.randn(16, 2))): + yield + else: + yield + + +@pytest.fixture +def default_moe_config(): + """default moe config""" + return { + 'num_experts': 8, + 'top_k': 2, + 'hidden_size': 512, + 'intermediate_size': 1024 + } + + +@pytest.fixture +def moe_method(mock_dist_env): + moe = MagicMock() + moe.moe_parallel_config.return_value = MagicMock(ep_size=4) + return TorchairAscendUnquantizedFusedMoEMethod(moe) + + +class Device(TypedDict): + device_id: int + device_expert: List[int] + + +class Layer(TypedDict): + layer_id: int + device_count: int + device_list: List[Device] + + +class MockData(TypedDict): + moe_layer_count: int + layer_list: List[Layer] + + +class MockQuantMethod(nn.Module): + + def __init__(self, shared_experts, num_tokens): + super().__init__() + if shared_experts: + self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32), + torch.randn(num_tokens, 10))) + else: + self.apply = MagicMock(return_value=(torch.randn(num_tokens, 32))) + + +class MockFusedMoEMethod(FusedMoEMethodBase): + moe = MagicMock() + + def __init__(self): + super().__init__(self.moe) + + def create_weights(self, layer: torch.nn.Module, num_experts: int, + 
hidden_size: int, intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): + pass + + def apply(self, hidden_states: torch.Tensor, + expert_weights: torch.Tensor) -> torch.Tensor: + pass + + +class TestTorchairAscendFusedMoe: + + def test_init_no_quant(self, mock_dist_env, default_moe_config): + layer = TorchairAscendFusedMoE(**default_moe_config) + + layer.w13_weight = nn.Parameter( + torch.randn(default_moe_config['num_experts'], + default_moe_config['intermediate_size'] * 2, + default_moe_config['hidden_size'])) + layer.w2_weight = nn.Parameter( + torch.randn(default_moe_config['num_experts'], + default_moe_config['hidden_size'], + default_moe_config['intermediate_size'])) + + assert layer.num_experts == default_moe_config['num_experts'] + assert layer.top_k == default_moe_config['top_k'] + assert hasattr(layer, 'w13_weight') + assert hasattr(layer, 'w2_weight') + + # check group_topk + with pytest.raises(AssertionError): + error_config = default_moe_config.copy() + error_config['use_grouped_topk'] = True + layer = TorchairAscendFusedMoE(**error_config) + + # check scoring_func + with pytest.raises(ValueError): + error_config = default_moe_config.copy() + error_config['scoring_func'] = "random" + layer = TorchairAscendFusedMoE(**error_config) + + def test_init_with_quant(self, mock_dist_env, default_moe_config): + mock_quant_config = MagicMock() + mock_quant_method = MockFusedMoEMethod() + mock_quant_config.get_quant_method.return_value = mock_quant_method + mock_quant_config.is_layer_skipped_ascend.return_value = False + with patch( + 'vllm_ascend.quantization.quantizer.AscendQuantizer.get_quantizer', + return_value=W8A8Quantizer): + moe = TorchairAscendFusedMoE(**default_moe_config, + quant_config=mock_quant_config) + + assert moe.quant_method is not None + assert isinstance(moe.quant_method, AscendFusedMoEMethod) + + def test_init_with_mixed_quant(self, mock_dist_env, default_moe_config): + mock_quant_config = MagicMock() + 
mock_quant_method = MockFusedMoEMethod() + mock_quant_config.get_quant_method.return_value = mock_quant_method + mock_quant_config.is_layer_skipped_ascend.return_value = True + + moe = TorchairAscendFusedMoE(**default_moe_config, + quant_config=mock_quant_config) + + assert moe.quant_method is not None + assert isinstance(moe.quant_method, + TorchairAscendUnquantizedFusedMoEMethod) + + @pytest.mark.parametrize( + "others_param", + [[None, + MagicMock(return_value=torch.randn(5, 32)), False, 5, None], + [2, None, False, 5, None], [None, None, True, 5, None], + [None, None, False, 1, None], [None, None, True, 5, 1], + [None, None, False, 5, 1]]) + def test_forward(self, mock_dist_env, default_moe_config, others_param): + """ + 1 test has shared_experts + 2 test has top_k + 3 test is_prefill is true + 4 test single num_tokens(decode) + 5 test ep_size is 1 and is_prefill is true + 6 test ep_size is 1 and is_prefill is False + """ + top_k, shared_experts, is_prefill, num_tokens, ep_size = others_param + inputs = torch.randn(num_tokens, 32) + router_logits = torch.randn(num_tokens, 8) + moe = TorchairAscendFusedMoE(**default_moe_config) + + if ep_size == 1: + moe.moe_parallel_config.ep_size = 1 + + moe.quant_method = MockQuantMethod(shared_experts, num_tokens) + forward_context = MagicMock(mc2_mask=torch.zeros(num_tokens, + dtype=torch.bool), + padded_num_tokens=num_tokens) + with patch( + "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", + return_value=forward_context): + output = moe.forward(inputs, + router_logits, + is_prefill=is_prefill, + top_k=top_k, + shared_experts=shared_experts) + + moe.quant_method.apply.assert_called_once() + + if shared_experts: + assert output[0].shape == (num_tokens, 32) + assert output[1].shape == (num_tokens, 10) + else: + assert output.shape == (num_tokens, 32) + + def test_forward_ms_fused_moe_comp(self, mock_dist_env, + default_moe_config): + inputs = torch.randn(5, 32) + router_logits = torch.randn(5, 8) + moe = 
TorchairAscendFusedMoE(**default_moe_config) + + moe.quant_method = MockQuantMethod(None, 5) + output = moe._forward_ms_fused_moe_comp(inputs, + router_logits, + is_prefill=False, + real_top_k=1) + + moe.quant_method.apply.assert_called_once() + + assert output.shape == (5, 32) + + +class TestTorchairAscendUnquantizedFusedMoEMethod: + + def test_process_weights_after_loading(self, moe_method, mock_dist_env): + layer = MagicMock() + layer.w13_weight.data = torch.randn(16, 32) + layer.w2_weight.data = torch.randn(16, 32) + + moe_method.process_weights_after_loading(layer) + + assert isinstance(layer.w13_weight, torch.nn.Parameter) + assert isinstance(layer.w2_weight, torch.nn.Parameter) + assert not layer.w13_weight.requires_grad + assert not layer.w2_weight.requires_grad + + @pytest.mark.parametrize("others_param", + [[256, 4], [128, 1], [128, 1], [128, 4]]) + def test_apply_without_expert_map(self, moe_method, mock_dist_env, + mock_moe_env, others_param): + """ + 1 test is_deepseek_v3_r1=true and use fused_experts_with_all2all + 2 test use_select_experts and fused_experts + 3 test use select_gating_topk_softmax_experts and fused_experts + 4 test use select_experts and fused_experts_with_all2all_buffer + """ + global_num_experts, ep_size = others_param + is_prefill = False + is_deepseek_v3_r1 = global_num_experts == 256 + forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + ep_size, is_prefill, is_deepseek_v3_r1)) + with patch( + "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", + return_value=forward_context): + moe_method.ep_size = ep_size + x = torch.randn(8, 2, 2) + router_logits = torch.randn(8, 8) + layer = MagicMock() + layer.w13_weight = torch.randn(8, 16, 1) + layer.w2_weight = torch.randn(16, 8, 1) + result = moe_method.apply(layer=layer, + x=x, + router_logits=router_logits, + top_k=2, + renormalize=True, + global_num_experts=global_num_experts, + is_prefill=is_prefill) + + if ep_size == 1: + assert result.shape == (16, 2) 
+ else: + assert result.shape == x.shape + + @pytest.mark.parametrize("others_param", [16, 1, 4]) + def test_apply_with_expert_map(self, moe_method, mock_dist_env, + mock_moe_env, others_param): + """ + 1 test use_select_experts and use fused_expters_with_mc2 + 2 test use_select_experts and fused_experts_with_all2all_buffer + 3 test use_select_experts and fused_experts_with_all2all + 4 test use_select_experts and fused_experts + """ + ep_size = others_param + is_prefill = False + forward_context = MagicMock( + fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) + with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ + patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): + expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) + moe_method.ep_size = ep_size + x = torch.randn(8, 2, 2) + if ep_size == 1: + x = x.view(-1, 2) + router_logits = torch.randn(8, 8) + layer = MagicMock() + layer.w13_weight = torch.randn(8, 16, 1) + layer.w2_weight = torch.randn(16, 8, 1) + result = moe_method.apply(layer=layer, + x=x, + router_logits=router_logits, + top_k=2, + renormalize=True, + global_num_experts=128, + expert_map=expert_map, + is_prefill=is_prefill) + + if ep_size == 16 or ep_size == 1: + assert result.shape == (16, 2) + else: + assert result.shape == x.shape diff --git a/tests/ut/torchair/ops/test_torchair_rotary_embedding.py b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py new file mode 100644 index 0000000..e7c68f7 --- /dev/null +++ b/tests/ut/torchair/ops/test_torchair_rotary_embedding.py @@ -0,0 +1,332 @@ +import math +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.torchair.ops.torchair_rotary_embedding import ( + custom_rotary_embedding_enabled, native_rope_deepseek_forward, + rope_forward_oot, rotate_half, yarn_find_correction_dim, yarn_get_mscale) + + +class 
TestCustomRotaryEmbeddingEnabled(TestBase): + + def setUp(self): + # Common setup for tests + self.positions = torch.tensor([1, 2, 3]) + self.query = torch.randn(3, 4, dtype=torch.float16) + self.key = torch.randn(3, 4, dtype=torch.float16) + self.head_size = 32 + self.cos_sin_cache = torch.randn(3, 4) + + # Mock self object for rope_forward_oot + self.mock_self = MagicMock() + self.mock_self.head_size = self.head_size + self.mock_self.cos_sin_cache = self.cos_sin_cache + self.mock_self.is_neox_style = True + self.mock_self.forward_native.return_value = (self.query, self.key) + + def test_custom_rotary_embedding_enabled(self): + # Test when all conditions are True + with patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op', + return_value=True): + result = custom_rotary_embedding_enabled(self.query, True, + self.head_size) + self.assertTrue(result) + + # Test when dtype is not float16 + with patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op', + return_value=True): + query = self.query.to(torch.float32) + result = custom_rotary_embedding_enabled(query, True, + self.head_size) + self.assertFalse(result) + + # Test when neox_style is False + with patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op', + return_value=True): + result = custom_rotary_embedding_enabled(self.query, False, + self.head_size) + self.assertFalse(result) + + # Test when head_size is not divisible by 32 + with patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op', + return_value=True): + result = custom_rotary_embedding_enabled(self.query, True, + self.head_size + 1) + self.assertFalse(result) + + # Test when custom op is disabled + with patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.enable_custom_op', + return_value=False): + result = custom_rotary_embedding_enabled(self.query, True, + self.head_size) + self.assertFalse(result) + + +class TestRopeForwardOot(TestBase): + + def 
setUp(self): + # Common setup for tests + self.positions = torch.tensor([1, 2, 3]) + self.query = torch.randn(3, 4, dtype=torch.float16) + self.key = torch.randn(3, 4, dtype=torch.float16) + self.head_size = 32 + self.cos_sin_cache = torch.randn(3, 4) + + # Mock self object for rope_forward_oot + self.mock_self = MagicMock() + self.mock_self.head_size = self.head_size + self.mock_self.cos_sin_cache = self.cos_sin_cache + self.mock_self.is_neox_style = True + self.mock_self.forward_native.return_value = (self.query, self.key) + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') + def test_rope_forward_oot_torchair_enabled_base(self, + mock_get_ascend_config): + # Setup mock for torchair enabled + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = True + mock_get_ascend_config.return_value = mock_config + + result_q, result_k = rope_forward_oot(self.mock_self, self.positions, + self.query, self.key) + + self.mock_self.forward_native.assert_called_once_with( + self.positions, self.query, self.key, None) + self.assertTrue(torch.equal(result_q, self.query)) + self.assertTrue(torch.equal(result_k, self.key)) + + @patch('torch.ops._C') + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') + @patch('vllm_ascend.torchair.ops.torchair_rotary_embedding.is_310p', + return_value=False) + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled', + return_value=True) + @patch('torch.ops._npu_rotary_embedding') + def test_rope_forward_oot_custom_kernel(self, mock_rotary_embedding, + mock_custom_enabled, mock_is_310p, + mock_get_ascend_config, mock__c): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + mock_get_ascend_config.return_value = mock_config + + # Setup mock for custom kernel path + + mock__c.rotary_embedding.return_value = self.query, self.key + + result_q, result_k = rope_forward_oot(self.mock_self, self.positions, + 
self.query, self.key) + + self.assertEqual(result_q.shape, self.query.shape) + self.assertEqual(result_k.shape, self.key.shape) + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled', + return_value=False) + @patch('torch_npu._npu_rotary_embedding') + def test_rope_forward_oot_contiguous(self, mock_npu_rotary, + mock_custom_enabled, + mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + mock_get_ascend_config.return_value = mock_config + + # Test contiguous path when custom is disabled + non_contig_query = self.query.transpose(0, 1) + non_contig_key = self.key.transpose(0, 1) + + result_q, result_k = rope_forward_oot(self.mock_self, self.positions, + non_contig_query, non_contig_key) + + mock_npu_rotary.assert_called_once() + self.assertEqual(result_q.shape, non_contig_query.shape) + self.assertEqual(result_k.shape, non_contig_key.shape) + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') + def test_rope_forward_oot_with_offsets(self, mock_get_ascend_config): + mock_config = MagicMock() + mock_config.torchair_graph_config.enabled = False + mock_get_ascend_config.return_value = mock_config + + # Test that NotImplementedError is raised when offsets is provided + offsets = torch.tensor([1, 2, 3]) + with self.assertRaises(NotImplementedError): + rope_forward_oot(self.mock_self, self.positions, self.query, + self.key, offsets) + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.get_ascend_config') + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.custom_rotary_embedding_enabled', + return_value=False) + @patch('torch_npu._npu_rotary_embedding') + def test_rope_forward_oot_neox_style_override(self, mock_npu_rotary, + mock_custom_enabled, + mock_get_ascend_config): + mock_config = MagicMock() + 
mock_config.torchair_graph_config.enabled = False + mock_get_ascend_config.return_value = mock_config + + # Test neox_style override + result_q, result_k = rope_forward_oot(self.mock_self, + self.positions, + self.query, + self.key, + is_neox_style_override=False) + + # Check that neox_style=False was passed to the NPU function + args, kwargs = mock_npu_rotary.call_args + self.assertFalse(args[-1]) + + +class MockRopeModule: + + def __init__(self, max_seq_len=2048, is_neox_style=True): + self.max_seq_len = max_seq_len + self.is_neox_style = is_neox_style + self.cos_cached = None + self.sin_cached = None + self.rotary_dim = 1 + self.base = 1 + + +class TestNativeRopeDeepseekForward(TestBase): + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot') + def test_native_rope_deepseek_forward_base(self, mock_rope_forward_oot): + module = MockRopeModule() + positions = torch.tensor([1, 2, 3]) + query = torch.randn(1, 8, 128) + key = torch.randn(1, 8, 128) + + mock_rope_forward_oot.return_value = (query, key) + + q_pe, k_pe = native_rope_deepseek_forward(module, positions, query, + key) + + assert q_pe.shape == query.shape + assert k_pe.shape == key.shape + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding._set_cos_sin_cache' + ) + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot') + def test_native_rope_deepseek_forward_cache_handling( + self, mock_rope_forward_oot, mock_set_cache): + # Test cache situation is true + module = MockRopeModule(max_seq_len=1024) + positions = torch.tensor([1, 2, 3]) + query = torch.randn(1, 8, 128) + key = torch.randn(1, 8, 128) + + mock_rope_forward_oot.return_value = (query, key) + + q_pe, k_pe = native_rope_deepseek_forward(module, + positions, + query, + key, + max_seq_len=2048) + + assert q_pe.shape == query.shape + assert k_pe.shape == key.shape + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot') + def 
test_native_rope_deepseek_forward_key_reshaping( + self, mock_rope_forward_oot): + module = MockRopeModule() + positions = torch.tensor([1, 2, 3]) + query = torch.randn(1, 8, 128) + key = torch.randn(1, 128) + + mock_rope_forward_oot.return_value = (query, key) + + q_pe, k_pe = native_rope_deepseek_forward(module, positions, query, + key) + + assert q_pe.shape == query.shape + assert k_pe.shape == (1, 128) + + @patch( + 'vllm_ascend.torchair.ops.torchair_rotary_embedding.rope_forward_oot') + def test_native_rope_deepseek_forward_non_neox_style( + self, mock_rope_forward_oot): + module = MockRopeModule(is_neox_style=False) + positions = torch.tensor([1, 2, 3]) + query = torch.randn(1, 8, 128) + key = torch.randn(1, 8, 128) + + mock_rope_forward_oot.return_value = (query, key) + + q_pe, k_pe = native_rope_deepseek_forward(module, positions, query, + key) + + assert q_pe.shape == query.shape + assert k_pe.shape == key.shape + + +class TestRotateHalf(TestBase): + + def test_rotate_half_even_dim(self): + # Test with even dimension + x = torch.tensor([1.0, 2.0, 3.0, 4.0]) + expected = torch.tensor([-3.0, -4.0, 1.0, 2.0]) + result = rotate_half(x) + self.assertTrue(torch.allclose(result, expected)) + + +class TestYarnFindCorrectionDim(TestBase): + + def test_basic_case(self): + # Test with standard values + num_rotations = 100 + dim = 512 + base = 10000 + max_position_embeddings = 2048 + + result = yarn_find_correction_dim(num_rotations, dim, base, + max_position_embeddings) + + # Calculate expected value manually + expected = (dim * torch.log( + torch.tensor(max_position_embeddings) / + (num_rotations * 2 * torch.pi))) / (2 * + torch.log(torch.tensor(base))) + + self.assertTrue(torch.allclose(result, expected)) + + +class TestYarnGetMscale(TestBase): + + def test_scale_less_than_or_equal_1(self): + self.assertEqual(yarn_get_mscale(scale=0.5), 1.0) + self.assertEqual(yarn_get_mscale(scale=1.0), 1.0) + self.assertEqual(yarn_get_mscale(scale=0.999), 1.0) + + def 
test_scale_greater_than_1(self): + test_cases = [(2.0, 1.0, 1.0 + 0.1 * math.log(2.0)), + (10.0, 1.0, 1.0 + 0.1 * math.log(10.0)), + (5.0, 2.0, 1.0 + 0.2 * math.log(5.0)), + (math.e, 1.0, 1.0 + 0.1)] + + for scale, mscale, expected in test_cases: + result = yarn_get_mscale(scale, mscale) + self.assertAlmostEqual( + result, + expected, + places=6, + msg=f"Failed for scale={scale}, mscale={mscale}") diff --git a/tests/ut/torchair/quantization/test_torchair_w4a8_dynamic.py b/tests/ut/torchair/quantization/test_torchair_w4a8_dynamic.py new file mode 100644 index 0000000..cd94101 --- /dev/null +++ b/tests/ut/torchair/quantization/test_torchair_w4a8_dynamic.py @@ -0,0 +1,176 @@ +import copy +from unittest.mock import Mock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import ( + TorchairAscendW4A8DynamicFusedMoEMethod, + TorchairAscendW4A8DynamicLinearMethod) + + +class TestAscendW4A8DynamicLinearMethod(TestBase): + + def setUp(self): + self.method = TorchairAscendW4A8DynamicLinearMethod() + self.method.group_size = 8 + + def test_get_weight(self): + weight = self.method.get_weight(8, 32, torch.bfloat16) + self.assertEqual(weight["weight"].dtype, torch.int8) + self.assertEqual(weight["weight"].shape, (32, 8)) + + def test_get_pergroup_param(self): + params = self.method.get_pergroup_param(8, 32, torch.bfloat16) + self.assertEqual(params["weight_scale"].dtype, torch.bfloat16) + self.assertEqual(params["weight_scale"].shape, (32, 1)) + self.assertEqual(params["weight_offset"].dtype, torch.bfloat16) + self.assertEqual(params["weight_offset"].shape, (32, 1)) + self.assertEqual(params["weight_scale_second"].dtype, torch.bfloat16) + self.assertEqual(params["weight_scale_second"].shape, (32, 1)) + self.assertEqual(params["weight_offset_second"].dtype, torch.bfloat16) + self.assertEqual(params["weight_offset_second"].shape, (32, 1)) + + +class TestAscendW4A8DynamicFusedMoEMethod(TestBase): + experts = 8 
+ input_size = 16 + output_size = 56 + group_size = 2 + + @patch( + 'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_current_vllm_config' + ) + @patch( + 'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_ep_group') + @patch("vllm_ascend.ascend_config.get_ascend_config") + @patch( + 'vllm_ascend.torchair.quantization.torchair_w4a8_dynamic.get_mc2_group' + ) + @patch('torch.distributed.get_rank', return_value=0) + def setUp(self, mock_get_rank, mock_get_mc2_group, mock_get_ascend_config, + mock_get_ep_group, get_current_vllm_config): + mock_ascend_config = Mock() + mock_ascend_config.torchair_graph_config = Mock(enabled=False) + mock_get_ascend_config.return_value = mock_ascend_config + mock_vllm_config = Mock() + mock_vllm_config.quant_config = Mock(quant_description={ + "group_size": self.group_size, + "version": "0.0.0" + }) + mock_vllm_config.parallel_config = Mock(enable_expert_parallel=True) + get_current_vllm_config.return_value = mock_vllm_config + self.quant_method = TorchairAscendW4A8DynamicFusedMoEMethod() + + def test_get_weight(self): + # old quant version w4a8 weight + param_dict = self.quant_method.get_weight(self.experts, + self.input_size, + self.output_size, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight"].dtype, torch.int8) + self.assertEqual(param_dict["w13_weight"].shape, + (self.experts, 2 * self.input_size, self.output_size)) + # new quant version weight + self.quant_method.new_quant_version = True + param_dict = self.quant_method.get_weight(self.experts, + self.input_size, + self.output_size, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight"].dtype, torch.int8) + self.assertEqual(param_dict["w13_weight"].shape, + (self.experts, self.input_size, self.output_size)) + + def test_get_dynamic_quant_param(self): + # old quant version weight + param_dict = self.quant_method.get_dynamic_quant_param( + self.experts, self.input_size, self.output_size, torch.bfloat16) + 
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16) + self.assertEqual(param_dict["w13_weight_scale"].shape, + (self.experts, 2 * self.input_size, 1)) + self.assertEqual(param_dict["w13_weight_scale_second"].dtype, + torch.bfloat16) + self.assertEqual(param_dict["w13_weight_scale_second"].shape, + (self.experts, 2 * self.input_size, + self.output_size // self.group_size)) + self.assertEqual(param_dict["w2_weight_scale"].dtype, torch.bfloat16) + self.assertEqual(param_dict["w2_weight_scale"].shape, + (self.experts, self.output_size, 1)) + self.assertEqual(param_dict["w2_weight_scale_second"].dtype, + torch.bfloat16) + self.assertEqual(param_dict["w2_weight_scale_second"].shape, + (self.experts, self.output_size, + self.input_size // self.group_size)) + # new quant version weight + self.quant_method.new_quant_version = True + param_dict = self.quant_method.get_dynamic_quant_param( + self.experts, self.input_size, self.output_size, torch.bfloat16) + self.assertEqual(param_dict["w2_scale_bias"].dtype, torch.float32) + self.assertEqual( + param_dict["w2_scale_bias"].shape, + (self.experts, self.output_size, 16 // self.quant_method.tp_size)) + + @patch('torch_npu.npu_quantize') + @patch('torch.Tensor.npu') + def test_process_weights_after_loading(self, mock_npu, mock_npu_quantize): + # old quant version weight + layer = torch.nn.Module() + layer.w13_weight = torch.nn.Parameter(torch.zeros( + (self.experts, 2 * self.input_size, self.output_size), + dtype=torch.int8), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(torch.zeros( + (self.experts, self.output_size, self.input_size), + dtype=torch.int8), + requires_grad=False) + layer.w13_weight_scale = torch.nn.Parameter(torch.ones( + (self.experts, 2 * self.input_size, 1), dtype=torch.bfloat16), + requires_grad=False) + layer.w13_weight_scale_second = torch.nn.Parameter(torch.ones( + (self.experts, 2 * self.input_size, + self.output_size // self.group_size), + dtype=torch.bfloat16), + 
requires_grad=False) + layer.w2_weight_scale = torch.nn.Parameter(torch.ones( + (self.experts, self.output_size, 1), dtype=torch.bfloat16), + requires_grad=False) + layer.w2_weight_scale_second = torch.nn.Parameter(torch.ones( + (self.experts, self.output_size, + self.input_size // self.group_size), + dtype=torch.bfloat16), + requires_grad=False) + new_layer = copy.deepcopy(layer) + + mock_npu.return_value = torch.Tensor() + mock_npu_quantize.return_value = torch.Tensor() + self.quant_method.process_weights_after_loading(layer) + self.assertTrue(hasattr(layer, "w13_scale_bias")) + self.assertEqual(layer.w13_scale_bias.data.shape, + (self.experts, 2 * self.input_size)) + self.assertEqual(layer.w13_scale_bias.data.dtype, torch.float32) + self.assertTrue(hasattr(layer, "w2_scale_bias")) + self.assertEqual(layer.w2_scale_bias.data.shape, + (self.experts, self.output_size)) + self.assertEqual(layer.w2_scale_bias.data.dtype, torch.float32) + # new quant version weight + self.quant_method.new_quant_version = True + new_layer.w13_weight.data = torch.zeros( + (self.experts, self.input_size, self.output_size), + dtype=torch.int8) + new_layer.w2_weight.data = torch.zeros( + (self.experts, self.output_size // 2, self.input_size), + dtype=torch.int8) + w13_scale_bias = torch.zeros((self.experts, 2 * self.input_size, 1), + dtype=torch.float32) + new_layer.w13_scale_bias = torch.nn.Parameter(w13_scale_bias, + requires_grad=False) + w2_scale_bias = torch.zeros( + (self.experts, self.output_size, 16 // self.quant_method.tp_size), + dtype=torch.float32) + new_layer.w2_scale_bias = torch.nn.Parameter(w2_scale_bias, + requires_grad=False) + self.quant_method.process_weights_after_loading(new_layer) + self.assertEqual(new_layer.w13_scale_bias.data.shape, + (self.experts, 2 * self.input_size)) + self.assertEqual(new_layer.w2_scale_bias.data.shape, + (self.experts, self.output_size)) diff --git a/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py 
b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py new file mode 100644 index 0000000..520155d --- /dev/null +++ b/tests/ut/torchair/quantization/test_torchair_w8a8_dynamic.py @@ -0,0 +1,75 @@ +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \ + torchair_fused_experts_with_all2all + + +class TestAscendW8A8FusedMoEMethod(TestBase): + + def setUp(self): + self.hidden_size = 128 + self.num_tokens = 128 + self.placeholder = torch.randn(self.num_tokens, + self.hidden_size, + dtype=torch.bfloat16) + + @patch("torch.distributed.all_to_all_single") + @patch("torch_npu.npu_moe_re_routing") + @patch("torch_npu.npu_grouped_matmul") + @patch("torch_npu.npu_swiglu") + @patch("torch_npu.npu_dynamic_quant") + @patch("torch_npu.npu_moe_finalize_routing") + @patch("torch_npu.npu_moe_init_routing") + def test_torchair_fused_experts_with_all2all( + self, mock_moe_init_routing, mock_moe_finalize_routing, + mock_dynamic_quant, mock_swiglu, mock_grouped_matmul, + mock_moe_re_routing, mock_all_to_all_single): + + expert_map = MagicMock() + ep_group = MagicMock() + placeholder_int8 = torch.randint(0, + 100, + (self.num_tokens, self.hidden_size), + dtype=torch.int8) + placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32) + mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_( + input) + mock_moe_init_routing.return_value = ( + placeholder_int8, + placeholder_ones, + placeholder_ones, + ) + mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder, + torch.randint(0, + 100, + (self.num_tokens, ), + dtype=torch.int32), + self.placeholder) + mock_grouped_matmul.return_value = self.placeholder + mock_swiglu.return_value = self.placeholder + mock_dynamic_quant.return_value = ( + placeholder_int8, + torch.randn(self.num_tokens), + ) + mock_moe_finalize_routing.return_value = self.placeholder + + result = 
torchair_fused_experts_with_all2all( + hidden_states=self.placeholder, + w1=self.placeholder, + w1_scale=self.placeholder, + w2=self.placeholder, + w2_scale=self.placeholder, + topk_weights=self.placeholder, + topk_ids=self.placeholder, + top_k=8, + expert_map=expert_map, + ep_group=ep_group, + log2phy=None, + global_redundant_expert_num=256, + ) + self.assertIsNotNone(result) + self.assertEqual(result.dtype, torch.bfloat16) + self.assertEqual(result.shape, (128, 128)) diff --git a/tests/ut/torchair/test_torchair_mla.py b/tests/ut/torchair/test_torchair_mla.py new file mode 100644 index 0000000..6ee983a --- /dev/null +++ b/tests/ut/torchair/test_torchair_mla.py @@ -0,0 +1,817 @@ +from unittest.mock import MagicMock, patch + +import torch +from torch import nn +from vllm.distributed.parallel_state import GroupCoordinator +from vllm.model_executor.layers.linear import LinearBase + +from tests.ut.base import TestBase +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.torchair.torchair_mla import ( + AscendMLATorchairBackend, AscendMLATorchairDecodeMetadata, + AscendMLATorchairImpl, AscendMLATorchairMetadata, + AscendMLATorchairMetadataBuilder, AscendMLATorchairPrefillMetadata) +from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata + + +class TestAscendMLATorchairBackend(TestBase): + + def test_get_name(self): + self.assertEqual(AscendMLATorchairBackend.get_name(), + "ASCEND_MLA_TORCHAIR") + + def test_get_metadata_cls(self): + self.assertEqual(AscendMLATorchairBackend.get_metadata_cls(), + AscendMLATorchairMetadata) + + def test_get_builder_cls(self): + self.assertEqual(AscendMLATorchairBackend.get_builder_cls(), + AscendMLATorchairMetadataBuilder) + + def test_get_kv_cache_shape(self): + result = AscendMLATorchairBackend.get_kv_cache_shape(2, 4, 8, 128) + self.assertEqual(result, (2, 4, 8, 128)) + + def test_get_impl_cls(self): + result = 
AscendMLATorchairBackend.get_impl_cls() + self.assertEqual(result, AscendMLATorchairImpl) + + +class TestAscendMLATorchairPrefillMetadata(TestBase): + + def test_ascend_mla_prefill_metadata_default(self): + attn_mask = torch.tensor([[1, 0], [1, 1]], dtype=torch.bool) + query_lens = [1, 2] + seq_lens = [2, 2] + context_lens = torch.tensor([1, 2]) + input_positions = torch.tensor([0, 1, 0, 1]) + query_start_loc = torch.tensor([0, 1, 3]) + block_table = torch.tensor([[0, 1], [2, 3]]) + max_query_len = 2 + max_seq_lens = 2 + + metadata = AscendMLATorchairPrefillMetadata( + attn_mask=attn_mask, + query_lens=query_lens, + seq_lens=seq_lens, + context_lens=context_lens, + input_positions=input_positions, + query_start_loc=query_start_loc, + block_table=block_table, + max_query_len=max_query_len, + max_seq_lens=max_seq_lens) + self.assertIs(metadata.attn_mask, attn_mask) + self.assertEqual(metadata.query_lens, query_lens) + self.assertEqual(metadata.seq_lens, seq_lens) + self.assertIs(metadata.context_lens, context_lens) + self.assertIs(metadata.input_positions, input_positions) + self.assertIs(metadata.query_start_loc, query_start_loc) + self.assertIs(metadata.block_table, block_table) + self.assertEqual(metadata.max_query_len, max_query_len) + self.assertEqual(metadata.max_seq_lens, max_seq_lens) + self.assertIsNone(metadata.chunked_context) + + def test_ascend_mla_prefill_metadata_with_chunked_context(self): + cu_seq_lens = torch.tensor([0, 2, 4]) + starts = torch.tensor([0, 2]) + seq_tot = [2, 2] + max_seq_lens = [2, 2] + workspace = torch.randn(2, 4) + chunk_seq_lens = torch.tensor([2, 2]) + + chunked_context = AscendMLATorchairPrefillMetadata.TorchairChunkedContextMetadata( + cu_seq_lens=cu_seq_lens, + starts=starts, + seq_tot=seq_tot, + max_seq_lens=max_seq_lens, + workspace=workspace, + chunk_seq_lens=chunk_seq_lens) + + metadata = AscendMLATorchairPrefillMetadata( + attn_mask=torch.tensor([[1, 0], [1, 1]], dtype=torch.bool), + query_lens=[1, 2], + seq_lens=[2, 2], 
+ context_lens=torch.tensor([1, 2]), + input_positions=torch.tensor([0, 1, 0, 1]), + query_start_loc=torch.tensor([0, 1, 3]), + block_table=torch.tensor([[0, 1], [2, 3]]), + max_query_len=2, + max_seq_lens=2, + chunked_context=chunked_context) + + self.assertIsNotNone(metadata.chunked_context) + self.assertIs(metadata.chunked_context.cu_seq_lens, cu_seq_lens) + self.assertIs(metadata.chunked_context.starts, starts) + self.assertEqual(metadata.chunked_context.seq_tot, seq_tot) + self.assertEqual(metadata.chunked_context.max_seq_lens, max_seq_lens) + self.assertIs(metadata.chunked_context.workspace, workspace) + self.assertIs(metadata.chunked_context.chunk_seq_lens, chunk_seq_lens) + + +class TestAscendMLATorchairDecodeMetadata(TestBase): + + def test_ascend_mla_decode_metadata_default(self): + input_positions = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 4]]) + block_table = torch.tensor([[0, 3, 2, 1], [0, 2, 1, 3]]) + seq_lens = torch.tensor([[2], [3]]) + max_seq_lens = 4 + seq_lens_list = [2, 3] + attn_mask = None + + metadata = AscendMLATorchairDecodeMetadata(input_positions, + block_table, seq_lens, + max_seq_lens, seq_lens_list, + attn_mask) + + self.assertIs(metadata.input_positions, input_positions) + self.assertIs(metadata.block_table, block_table) + self.assertIs(metadata.seq_lens, seq_lens) + self.assertEqual(metadata.max_seq_lens, max_seq_lens) + self.assertEqual(metadata.seq_lens_list, seq_lens_list) + self.assertIsNone(attn_mask) + + +class TestAscendMLATorchairMetadata(TestBase): + + def test_ascend_mla_metadata_default(self): + num_actual_tokens = 100 + slot_mapping = torch.randn(100, 4, 1024) + query_start_loc = torch.tensor([1, 2, 3, 4]) + seq_lens = [30, 50] + block_tables = torch.randint(0, 100, (100, 4)) + + num_decodes = 4 + num_decode_tokens = 8 + num_prefills = 8 + + num_input_tokens = 2 + + query_lens = None + head_dim = None + attn_mask = None + attn_state = AscendAttentionState.ChunkedPrefill + + decode = None + prefill = None + + metadata = 
AscendMLATorchairMetadata( + num_actual_tokens, slot_mapping, query_start_loc, seq_lens, + block_tables, num_decodes, num_decode_tokens, num_prefills, + num_input_tokens, query_lens, head_dim, attn_mask, attn_state, + decode, prefill) + + self.assertEqual(metadata.num_actual_tokens, num_actual_tokens) + self.assertIs(metadata.slot_mapping, slot_mapping) + self.assertIs(metadata.query_start_loc, query_start_loc) + self.assertEqual(metadata.seq_lens, seq_lens) + self.assertIs(metadata.block_tables, block_tables) + self.assertEqual(metadata.num_decodes, num_decodes) + self.assertEqual(metadata.num_decode_tokens, num_decode_tokens) + self.assertEqual(metadata.num_prefills, num_prefills) + self.assertEqual(metadata.num_input_tokens, num_input_tokens) + self.assertEqual(metadata.query_lens, query_lens) + self.assertEqual(metadata.head_dim, head_dim) + self.assertEqual(metadata.attn_mask, attn_mask) + self.assertEqual(metadata.attn_state, attn_state) + self.assertEqual(metadata.decode, decode) + self.assertEqual(metadata.prefill, prefill) + + +class TestAscendMLATorchairMetadataBuilder(TestBase): + + def test_ascend_mla_metadata_builder_default(self): + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.model_config.get_head_size.return_value = 64 + mock_vllm_config.model_config.dtype = torch.float16 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.max_num_seqs = 4 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + ascend_config = MagicMock() + ascend_config.torchair_graph_config = MagicMock() + ascend_config.torchair_graph_config.enabled = True + with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config", + return_value=ascend_config): + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + + self.assertEqual(builder.block_size, + mock_vllm_config.cache_config.block_size) + self.assertEqual( + 
builder.chunked_prefill_enabled, + mock_vllm_config.scheduler_config.chunked_prefill_enabled) + self.assertEqual(builder.torchair_graph_enabled, True) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_reorder_batch_with_torchair_graph(self, ascend_config): + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.max_num_seqs = 4 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + ascend_config.torchair_graph_config = MagicMock() + ascend_config.torchair_graph_config.enabled = True + + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + + input_batch = MagicMock() + input_batch.req_ids = [0, 1, 2, 3] + + scheduler_output = MagicMock() + scheduler_output.num_scheduled_tokens = {0: 2, 1: 1, 2: 3, 3: 1} + scheduler_output.scheduled_spec_decode_tokens = { + 0: [1], + 1: [], + 2: [1, 1], + 3: [] + } + + input_batch.swap_states = MagicMock() + + modified = builder.reorder_batch(input_batch, scheduler_output) + + self.assertFalse(modified) + input_batch.swap_states.assert_not_called() + + def test_reorder_batch_without_torchair_graph(self): + ascend_config = MagicMock() + ascend_config.torchair_graph_config = MagicMock() + ascend_config.torchair_graph_config.enabled = False + + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.max_num_seqs = 4 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + with patch("vllm_ascend.torchair.torchair_mla.get_ascend_config", + return_value=ascend_config): + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + + input_batch = MagicMock() + input_batch.req_ids = [0, 1, 2, 3] + + scheduler_output = MagicMock() + scheduler_output.num_scheduled_tokens = {0: 1, 
1: 3, 2: 1, 3: 2} + scheduler_output.scheduled_spec_decode_tokens = { + 0: [], + 1: [1], + 2: [], + 3: [] + } + + input_batch.swap_states = MagicMock() + + modified = builder.reorder_batch(input_batch, scheduler_output) + + self.assertTrue(modified) + input_batch.swap_states.assert_called_once_with(1, 2) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_get_graph_runner_block_tables_normal(self, mock_ascend_config): + ascend_config = MagicMock() + mock_ascend_config.return_value = ascend_config + ascend_config.torchair_graph_config.enabled = False + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32) + + result = builder._get_graph_runner_block_tables(3, block_tables) + self.assertEqual(result.shape[0], 3) + self.assertEqual(result.shape[1], 64) + self.assertTrue(torch.equal(result[:, :10], block_tables)) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_get_graph_runner_block_tables_truncated(self, mock_ascend_config): + ascend_config = MagicMock() + mock_ascend_config.return_value = ascend_config + ascend_config.torchair_graph_config.enabled = False + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 64 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32) + + result = builder._get_graph_runner_block_tables(3, block_tables) + self.assertEqual(result.shape[0], 3) + self.assertEqual(result.shape[1], 4) + self.assertTrue(torch.equal(result, 
block_tables[:, :4])) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_get_graph_runner_block_tables_from_numpy(self, + mock_ascend_config): + ascend_config = MagicMock() + mock_ascend_config.return_value = ascend_config + ascend_config.torchair_graph_config.enabled = False + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_device = 'cpu' + + builder = AscendMLATorchairMetadataBuilder(mock_vllm_config, + mock_device) + + block_tables = torch.randint(0, 100, (3, 10), dtype=torch.int32) + + result = builder._get_graph_runner_block_tables(3, block_tables) + + self.assertEqual(result.shape[0], 3) + self.assertEqual(result.shape[1], 64) + self.assertTrue(torch.equal(result[:, :10], block_tables)) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_build_dummy(self, mock_ascend_config): + ascend_config = MagicMock() + mock_ascend_config.return_value = ascend_config + ascend_config.torchair_graph_config.enabled = False + + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_vllm_config.get_head_size.return_value = 64 + mock_vllm_config.model_config.dtype = torch.float16 + mock_device = 'cpu' + + builder = AscendMLATorchairMetadataBuilder( + mock_vllm_config, + mock_device, + metadata_cls=AscendMLATorchairMetadata) + builder.rope_dim = 64 + + with patch.object(builder, + "_get_graph_runner_block_tables", + side_effect=lambda x, y: y): + common_attn_metadata = TorchairCommonAttentionMetadata( + num_reqs=3, + num_actual_tokens=3, + decode_token_per_req=1, + actual_seq_lengths_q=[0, 1, 2], + attn_mask=torch.zeros((1, 1), dtype=torch.bool), + spec_attn_mask=torch.zeros((1, 1), dtype=torch.bool), + ) + metadata = 
builder.build_torchair_graph_dummy(common_attn_metadata) + + sin_golden = torch.ones(3, + 1, + 1, + 64, + dtype=torch.float16, + device=mock_device) + cos_golden = torch.ones(3, + 1, + 1, + 64, + dtype=torch.float16, + device=mock_device) + + self.assertIsInstance(metadata, AscendMLATorchairMetadata) + self.assertEqual(metadata.num_input_tokens, 3) + self.assertEqual(metadata.num_actual_tokens, 3) + self.assertEqual(metadata.num_decodes, 1) + self.assertEqual(metadata.num_decode_tokens, 1) + self.assertEqual(metadata.num_prefills, 0) + self.assertEqual(metadata.attn_state, AscendAttentionState.DecodeOnly) + self.assertIsNone(metadata.prefill) + self.assertIsInstance(metadata.decode, AscendMLATorchairDecodeMetadata) + self.assertEqual(metadata.block_tables.shape[0], 3) + self.assertEqual(metadata.block_tables.shape[1], 64) + self.assertEqual(metadata.seq_lens.shape[0], 3) + self.assertEqual(metadata.slot_mapping.shape[0], 3) + self.assertEqual(metadata.query_start_loc.shape[0], 3) + assert torch.equal(sin_golden, metadata.decode.sin) + assert torch.equal(cos_golden, metadata.decode.cos) + + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def test_build_decode(self, mock_ascend_config): + ascend_config = MagicMock() + mock_ascend_config.return_value = ascend_config + ascend_config.torchair_graph_config.enabled = False + + mock_vllm_config = MagicMock() + mock_vllm_config.model_config.max_model_len = 1024 + mock_vllm_config.cache_config.block_size = 16 + mock_vllm_config.scheduler_config.chunked_prefill_enabled = False + mock_vllm_config.get_head_size.return_value = 64 + mock_vllm_config.model_config.dtype = torch.float16 + mock_device = 'cpu' + model = MagicMock(spec=nn.Module) + model.model = MagicMock(spec=nn.Module) + + builder = AscendMLATorchairMetadataBuilder( + mock_vllm_config, + mock_device, + metadata_cls=AscendMLATorchairMetadata) + builder.rope_dim = 64 + + builder.sin_cache = torch.tensor([10, 10]) + builder.cos_cache = torch.tensor([10, 
10]) + + with patch.object(builder, + "_get_graph_runner_block_tables", + side_effect=lambda x, y: y): + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=torch.tensor([0, 1, 2, 3]), + query_start_loc_cpu=torch.tensor([0, 1, 2, 3]), + seq_lens_cpu=torch.tensor([1, 1, 1]), + num_reqs=3, + num_actual_tokens=3, + max_query_len=1, + decode_token_per_req=torch.tensor([1, 1, 1]), + block_table_tensor=torch.zeros((10, 10)), + slot_mapping_cpu=torch.tensor(range(20)), + actual_seq_lengths_q=torch.tensor([0, 1, 2]), + positions=torch.tensor([1, 1]), + attn_mask=torch.ones((15, 15)), + spec_attn_mask=None, + attn_state=AscendAttentionState.ChunkedPrefill) + + metadata = builder.build(common_attn_metadata, model) + + self.assertIsInstance(metadata, AscendMLATorchairMetadata) + self.assertEqual(metadata.num_input_tokens, 0) + self.assertEqual(metadata.num_actual_tokens, 3) + self.assertEqual(metadata.num_decodes, 3) + self.assertEqual(metadata.num_decode_tokens, 3) + self.assertEqual(metadata.num_prefills, 0) + self.assertEqual(metadata.attn_state, + AscendAttentionState.ChunkedPrefill) + self.assertIsNone(metadata.prefill) + self.assertIsInstance(metadata.decode, AscendMLATorchairDecodeMetadata) + self.assertEqual(metadata.block_tables.shape[0], 3) + self.assertEqual(metadata.block_tables.shape[1], 10) + self.assertEqual(metadata.seq_lens.shape[0], 3) + self.assertEqual(metadata.slot_mapping.shape[0], 3) + self.assertEqual(metadata.query_start_loc.shape[0], 4) + + +class TestAscendMLATorchairImpl(TestBase): + + @patch('vllm.distributed.parallel_state._TP', + new_callable=lambda: MagicMock(spec=GroupCoordinator)) + @patch("vllm.distributed.get_tensor_model_parallel_world_size", + return_value=2) + @patch("vllm.config.get_current_vllm_config") + @patch("vllm_ascend.torchair.torchair_mla.get_ascend_config") + def setUp(self, ascend_config, vllm_config, mock_get_tp_size, mock_tp): + mock_tp.world_size = 2 + ascend_config.torchair_graph_config.enabled = True 
+ ascend_config.torchair_graph_config.enable_kv_nz = False + speculative_config = MagicMock() + speculative_config.num_speculative_tokens = 4 + vllm_config.speculative_config = speculative_config + + num_heads = 256 + head_size = 1024 + scale = 0.1 + num_kv_heads = 8 + kv_cache_dtype = "auto" + + kv_a_layernorm = MagicMock() + kv_a_layernorm.weight = torch.randn(96) + kv_a_layernorm.variance_epsilon = 1e-6 + kwargs = { + "q_lora_rank": 64, + "kv_lora_rank": 32, + "qk_nope_head_dim": 64, + "qk_rope_head_dim": 32, + "qk_head_dim": 96, + "v_head_dim": 128, + "rotary_emb": MagicMock(), + "q_proj": MagicMock(), + "kv_b_proj": MagicMock(), + "o_proj": MagicMock(), + "kv_a_proj_with_mqa": MagicMock(), + "kv_a_layernorm": kv_a_layernorm, + } + + self.impl = AscendMLATorchairImpl(num_heads=num_heads, + head_size=head_size, + scale=scale, + num_kv_heads=num_kv_heads, + alibi_slopes=None, + sliding_window=None, + kv_cache_dtype=kv_cache_dtype, + blocksparse_params=None, + logits_soft_cap=None, + attn_type=None, + kv_sharing_target_layer_name=None, + **kwargs) + + def test_init(self): + self.assertEqual(self.impl.num_heads, 256) + self.assertEqual(self.impl.head_size, 1024) + self.assertEqual(self.impl.scale, 0.1) + self.assertEqual(self.impl.num_kv_heads, 8) + self.assertEqual(self.impl.kv_cache_dtype, "auto") + self.assertEqual(self.impl.q_lora_rank, 64) + self.assertEqual(self.impl.kv_lora_rank, 32) + self.assertEqual(self.impl.qk_nope_head_dim, 64) + self.assertEqual(self.impl.qk_rope_head_dim, 32) + self.assertEqual(self.impl.qk_head_dim, 96) + self.assertEqual(self.impl.v_head_dim, 128) + self.assertIsNotNone(self.impl.rotary_emb) + self.assertIsNotNone(self.impl.q_proj) + self.assertIsNotNone(self.impl.kv_b_proj) + self.assertIsNotNone(self.impl.o_proj) + self.assertIsNotNone(self.impl.kv_a_proj_with_mqa) + self.assertIsNotNone(self.impl.kv_a_layernorm) + self.assertEqual(self.impl.num_queries_per_kv, 32) + self.assertEqual(self.impl.tp_size, 2) + 
self.assertTrue(self.impl.torchair_graph_enabled) + + def test_v_up_proj_and_o_proj(self): + batch_size = 4 + x = torch.randn(batch_size, self.impl.num_heads, + self.impl.kv_lora_rank) + + self.impl.o_proj.return_value = (torch.randn( + batch_size, self.impl.num_heads * self.impl.v_head_dim), ) + if not hasattr(self.impl, 'W_UV') or self.impl.W_UV is None: + self.impl.W_UV = torch.randn(self.impl.num_heads, + self.impl.kv_lora_rank, + self.impl.v_head_dim) + result = self.impl._v_up_proj_and_o_proj(x) + + self.assertEqual(result.shape[0], batch_size) + self.assertEqual(result.shape[1], + self.impl.num_heads * self.impl.v_head_dim) + + def test_q_proj_and_k_up_proj(self): + batch_size = 4 + x = torch.randn(batch_size, self.impl.num_heads, self.impl.qk_head_dim) + q_proj_output = torch.randn(batch_size, self.impl.num_heads, + self.impl.qk_head_dim) + self.impl.q_proj.return_value = (q_proj_output, ) + if not hasattr(self.impl, 'W_UK_T') or self.impl.W_UK_T is None: + self.impl.W_UK_T = torch.randn(self.impl.num_heads, + self.impl.qk_nope_head_dim, + self.impl.kv_lora_rank) + result = self.impl._q_proj_and_k_up_proj(x) + ql_nope, q_pe = result + self.assertEqual(ql_nope.shape[0], batch_size) + self.assertEqual(ql_nope.shape[1], self.impl.num_heads) + self.assertEqual(ql_nope.shape[2], self.impl.kv_lora_rank) + self.assertEqual(q_pe.shape[0], batch_size) + self.assertEqual(q_pe.shape[1], self.impl.num_heads) + self.assertEqual(q_pe.shape[2], self.impl.qk_rope_head_dim) + + def test_process_weights_after_loading(self): + layer = MagicMock(spec=LinearBase) + layer.input_size_per_partition = 10 + quant_method = MagicMock() + apply = MagicMock() + quant_method.apply = apply + layer.quant_method = quant_method + shape_0 = self.impl.num_heads * (self.impl.qk_nope_head_dim + + self.impl.v_head_dim) + shape_1 = self.impl.kv_lora_rank + layer.weight = torch.randn(shape_0, shape_1) + self.impl.kv_b_proj = layer + apply.return_value = layer.weight.T + 
self.impl.process_weights_after_loading(torch.bfloat16) + + self.assertEqual(self.impl.W_UK_T.shape[0], self.impl.num_heads) + self.assertEqual(self.impl.W_UK_T.shape[1], self.impl.qk_nope_head_dim) + self.assertEqual(self.impl.W_UK_T.shape[2], self.impl.kv_lora_rank) + + self.assertEqual(self.impl.W_UV.shape[0], self.impl.num_heads) + self.assertEqual(self.impl.W_UV.shape[1], self.impl.kv_lora_rank) + self.assertEqual(self.impl.W_UV.shape[2], self.impl.v_head_dim) + + def test_compute_prefill_context_none(self): + batch_size = 4 + kv_cache = torch.randn(10, 1, 1, 192) + query = torch.randn(batch_size, self.impl.num_heads, + self.impl.qk_head_dim) + metadata = MagicMock() + metadata.prefill = None + prefix_out = torch.randn(2, 16, 128) + prefix_lse = torch.randn(2, 16, 8) + out, lse = self.impl._compute_prefill_context(query, kv_cache, 32, + metadata, prefix_out, + prefix_lse) + + self.assertTrue(torch.equal(prefix_out, out)) + self.assertTrue(torch.equal(prefix_lse, lse)) + + @patch("torch_npu.atb.npu_paged_cache_load") + @patch("torch_npu.atb.npu_ring_mla") + def test_compute_prefill_context(self, mock_ring, mock_load): + S, N, D, VD = 2, self.impl.num_heads, self.impl.qk_head_dim, self.impl.v_head_dim + _, AND = self.impl.qk_rope_head_dim, self.impl.qk_nope_head_dim + latent_kv_dim = self.impl.kv_lora_rank + num_blocks, block_size = 100, 20 + query = torch.randn(S, N, D) + kv_cache_0 = torch.randn(num_blocks, block_size, N, latent_kv_dim) + kv_cache_1 = torch.randn(num_blocks, block_size, N, D) + kv_cache = [kv_cache_0, kv_cache_1] + prefix_out = torch.randn(S, N, 128) + prefix_lse = torch.randn(S, N) + + self.impl.kv_b_proj.return_value = (torch.randn(8, N, VD + AND), ) + + chunk_ctx = MagicMock() + chunk_ctx.seq_tot = [8] + chunk_ctx.chunk_seq_lens = [torch.tensor([8])] + chunk_ctx.starts = [torch.tensor([0])] + + prefill_meta = MagicMock() + prefill_meta.chunked_context = chunk_ctx + prefill_meta.query_lens = [8] + prefill_meta.block_table = torch.randint(0, 
100, (S, 4)) + + meta = MagicMock() + meta.prefill = prefill_meta + + out, lse = self.impl._compute_prefill_context(query, kv_cache, 32, + meta, prefix_out, + prefix_lse) + + mock_load.assert_called_once() + mock_ring.assert_called_once() + + self.assertEqual(out.shape, prefix_out.shape) + self.assertEqual(lse.shape, prefix_lse.shape) + + @patch("torch_npu.npu_kv_rmsnorm_rope_cache") + def test_exec_kv(self, mock_kv_cache): + batch_size = 2 + hidden = torch.randn(batch_size, 128) + cos = torch.randn(batch_size, 32) + sin = torch.randn(batch_size, 32) + kv_cache = (torch.randn( + 4, 8, self.impl.kv_lora_rank + self.impl.qk_rope_head_dim), + torch.randn( + 4, 8, + self.impl.kv_lora_rank + self.impl.qk_rope_head_dim)) + slots = torch.arange(batch_size, dtype=torch.long) + + proj_out = torch.randn( + batch_size, self.impl.num_kv_heads, 1, + self.impl.kv_lora_rank + self.impl.qk_rope_head_dim) + self.impl.kv_a_proj_with_mqa.return_value = (proj_out, ) + + mock_kv_cache.return_value = (torch.randn(batch_size, + self.impl.num_kv_heads, 1, + self.impl.qk_rope_head_dim), + torch.randn(batch_size, + self.impl.num_kv_heads, 1, + self.impl.kv_lora_rank), + None, None) + + k_pe, k_nope, kv = self.impl.exec_kv(hidden, cos, sin, kv_cache, slots) + + self.impl.kv_a_proj_with_mqa.assert_called_once_with(hidden) + mock_kv_cache.assert_called_once() + self.assertEqual(k_pe.shape, (batch_size, self.impl.num_kv_heads, 1, + self.impl.qk_rope_head_dim)) + self.assertEqual( + k_nope.shape, + (batch_size, self.impl.num_kv_heads, 1, self.impl.kv_lora_rank)) + self.assertEqual(kv.shape, + (batch_size, self.impl.num_kv_heads, 1, + self.impl.kv_lora_rank + self.impl.qk_rope_head_dim)) + + @patch("torch_npu.npu_kv_rmsnorm_rope_cache") + def test_exec_kv_prefill(self, mock_kv): + B, N, S, H = 2, self.impl.num_kv_heads, 1, 128 + hidden_states = torch.randn(B, N, S, H) + cos = torch.randn(B, S, 32) + sin = torch.randn(B, S, 32) + kv_cache = ( + torch.randn(100, 8, + self.impl.kv_lora_rank + 
self.impl.qk_rope_head_dim), + torch.randn(100, 8, + self.impl.kv_lora_rank + self.impl.qk_rope_head_dim), + ) + + slots = torch.arange(B * S, dtype=torch.long) + + proj_out = torch.randn( + B, N, S, self.impl.kv_lora_rank + self.impl.qk_rope_head_dim) + self.impl.kv_a_proj_with_mqa.return_value = (proj_out, ) + + mock_kv.return_value = (None, None, + torch.randn(B, self.impl.num_kv_heads, S, + self.impl.qk_rope_head_dim), + torch.randn(B, self.impl.num_kv_heads, S, + self.impl.kv_lora_rank)) + + k_pe, k_nope = self.impl.exec_kv_prefill(hidden_states, cos, sin, + kv_cache, slots) + + self.impl.kv_a_proj_with_mqa.assert_called_once_with(hidden_states) + mock_kv.assert_called_once() + + self.assertEqual( + k_pe.shape, + (B, self.impl.num_kv_heads, S, self.impl.qk_rope_head_dim)) + self.assertEqual( + k_nope.shape, + (B, self.impl.num_kv_heads, S, self.impl.kv_lora_rank)) + + @patch("torch_npu.npu_interleave_rope") + def test_rope_single(self, mock_rope): + B, N, D = 2, 16, 1024 + x = torch.randn(B, N, D) + cos = torch.randn(B, N, 1, D) + sin = torch.randn(B, N, 1, D) + mock_rope.return_value = x.view(B, N, 1, D) + result = self.impl.rope_single(x, cos, sin) + self.assertEqual(result.shape[0], B) + self.assertEqual(result.shape[1], N) + self.assertEqual(result.shape[2], D) + mock_rope.assert_called_once() + + @patch( + "vllm_ascend.torchair.torchair_mla.AscendMLATorchairImpl._v_up_proj_and_o_proj" + ) + @patch("torch_npu._npu_paged_attention_mla") + def test_forward_decode_without_graph(self, mock_page_attention_mla, + mock_up_proj): + self.impl.running_in_graph = False + self.impl.running_chunkprefilll_with_torchair = False + num_tokens = 100 + num_blocks = 256 + block_size = 4 + q_nope = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_nope_head_dim) + q_pe = torch.randn(num_tokens, self.impl.num_heads, + self.impl.qk_rope_head_dim) + kv_c_and_k_pe_cache = torch.randn(num_blocks, block_size, + self.impl.num_heads, + self.impl.kv_lora_rank) + metadata = 
MagicMock() + metadata.decode = MagicMock() + metadata.decode.block_table = MagicMock() + metadata.decode.seq_lens = 10 + mock_page_attention_mla.return_value = torch.randn( + num_tokens, self.impl.num_heads, self.impl.kv_lora_rank) + mock_up_proj.return_value = torch.randn(num_tokens, + self.impl.num_heads, + self.impl.v_head_dim) + result = self.impl._forward_decode(q_nope, q_pe, None, None, + kv_c_and_k_pe_cache, metadata) + self.assertEqual(result.shape[0], num_tokens) + self.assertEqual(result.shape[1], self.impl.num_heads) + self.assertEqual(result.shape[2], self.impl.v_head_dim) + mock_up_proj.assert_called_once() + mock_page_attention_mla.assert_called_once() + + @patch( + "vllm_ascend.torchair.torchair_mla.AscendMLATorchairImpl._forward_prefill" + ) + @patch("torch_npu._npu_reshape_and_cache") + def test_forward_without_graph(self, _, mock_forward_prefill): + self.impl.running_in_graph = False + self.impl.torchair_graph_enabled = False + + num_tokens = 100 + num_blocks = 256 + block_size = 4 + rotary_emb_return_value = (torch.randn(num_tokens, 16, + self.impl.kv_lora_rank), + torch.randn(0, 1, self.impl.kv_lora_rank)) + self.impl.rotary_emb.side_effect = lambda *args, **kwargs: rotary_emb_return_value + self.impl.o_proj.side_effect = lambda *args, **kwargs: torch.randn( + 1, num_blocks, 128) + + hidden_states_or_q_c = torch.randn(num_tokens, self.impl.q_lora_rank) + hidden_states_or_kv_c_normed = torch.randn(num_tokens, + self.impl.kv_lora_rank) + k_pe = torch.randn(num_tokens, self.impl.qk_rope_head_dim) + kv_cache = (torch.randn(num_blocks, block_size, self.impl.num_heads, + self.impl.kv_lora_rank), + torch.randn(num_blocks, block_size, self.impl.num_heads, + self.impl.qk_rope_head_dim)) + output = torch.randn(num_tokens, self.impl.num_heads, + self.impl.v_head_dim) + + metadata = MagicMock() + metadata.num_decodes = 0 + metadata.num_prefills = num_tokens + mock_forward_prefill.return_value = torch.randn( + 0, self.impl.num_heads * self.impl.v_head_dim) 
+ result = self.impl.forward(None, hidden_states_or_q_c, + hidden_states_or_kv_c_normed, k_pe, + kv_cache, metadata, output, False) + self.assertEqual(result.shape[0], num_tokens) diff --git a/tests/ut/torchair/test_utils.py b/tests/ut/torchair/test_utils.py new file mode 100644 index 0000000..fb526b5 --- /dev/null +++ b/tests/ut/torchair/test_utils.py @@ -0,0 +1,149 @@ +import os +from concurrent.futures import ThreadPoolExecutor +from unittest import mock +from unittest.mock import MagicMock, patch + +import torch + +from tests.ut.base import TestBase +from vllm_ascend.quantization.quantizer import SUPPORT_ASCEND_QUANTIZER_TYPE +from vllm_ascend.torchair import utils + + +class TestTorchairUtils(TestBase): + + def test_get_torchair_current_work_dir(self): + cache_dir = utils.TORCHAIR_CACHE_DIR + work_dir = utils._get_torchair_current_work_dir() + self.assertEqual(cache_dir, work_dir) + work_dir = utils._get_torchair_current_work_dir("test") + self.assertEqual(os.path.join(cache_dir, "test"), work_dir) + + def test_torchair_cache_dir(self): + utils.write_kv_cache_bytes_to_file(0, 100) + self.assertTrue(utils.check_torchair_cache_exist(), + "Create torchair cache dir failed") + self.assertTrue(utils.check_kv_cache_bytes_cache_exist(), + "Create kv cache bytes cache dir failed") + kv_cache_bytes = utils.read_kv_cache_bytes_from_file(0) + self.assertEqual(100, kv_cache_bytes) + utils.delete_torchair_cache_file() + self.assertFalse(utils.check_torchair_cache_exist(), + "Delete torchair cache dir failed") + self.assertFalse(utils.check_kv_cache_bytes_cache_exist(), + "Delete kv cache bytes cache dir failed") + + def test_torchair_cache_dir_multiple_ranks(self): + ranks = [0, 1, 2, 3] + values = [100, 200, 300, 400] + + with ThreadPoolExecutor() as executor: + executor.map(utils.write_kv_cache_bytes_to_file, ranks, values) + for rank, expected in zip(ranks, values): + self.assertEqual(expected, + utils.read_kv_cache_bytes_from_file(rank)) + 
utils.delete_torchair_cache_file() + + self.assertFalse(utils.check_torchair_cache_exist(), + "Delete torchair cache dir failed") + self.assertFalse(utils.check_kv_cache_bytes_cache_exist(), + "Delete kv cache bytes cache dir failed") + + def test_delete_torchair_cache_file_multiple_times(self): + utils.write_kv_cache_bytes_to_file(0, 100) + utils.delete_torchair_cache_file() + for i in range(5): + try: + utils.delete_torchair_cache_file() + except FileNotFoundError: + self.fail( + f"Unexpected FileNotFoundError on delete call #{i+2}") + + @patch('vllm.ModelRegistry') + def test_register_torchair_model(self, mock_model_registry): + mock_registry = MagicMock() + mock_model_registry.return_value = mock_registry + utils.register_torchair_model() + + self.assertEqual(mock_model_registry.register_model.call_count, 6) + call_args_list = mock_model_registry.register_model.call_args_list + + expected_registrations = [ + ("DeepSeekMTPModel", + "vllm_ascend.torchair.models.torchair_deepseek_mtp:TorchairDeepSeekMTP" + ), + ("DeepseekV2ForCausalLM", + "vllm_ascend.torchair.models.torchair_deepseek_v2:TorchairDeepseekV2ForCausalLM" + ), + ("DeepseekV3ForCausalLM", + "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM" + ), + ("Qwen2ForCausalLM", + "vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM"), + ("Qwen3MoeForCausalLM", + "vllm_ascend.torchair.models.qwen3_moe:CustomQwen3MoeForCausalLM" + ), + ("PanguProMoEForCausalLM", + "vllm_ascend.torchair.models.torchair_pangu_moe:PanguProMoEForCausalLM" + ) + ] + + for i, (expected_name, + expected_path) in enumerate(expected_registrations): + args, kwargs = call_args_list[i] + self.assertEqual(args[0], expected_name) + self.assertEqual(args[1], expected_path) + + @mock.patch('torch_npu.get_npu_format') + @mock.patch('torch_npu.npu_format_cast') + @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', + new=mock.MagicMock) + def test_converting_weight_acl_format(self, mock_npu_cast, + 
mock_get_format): + ACL_FORMAT_FRACTAL_NZ = 29 + mock_get_format.return_value = 1 + mock_npu_cast.return_value = 1 + + fused_moe = mock.MagicMock() + fused_moe.w13_weight = mock.MagicMock() + fused_moe.w2_weight = mock.MagicMock() + fused_moe.w13_weight.data = torch.randn(128, 256) + fused_moe.w2_weight.data = torch.randn(256, 128) + model = mock.MagicMock() + model.modules.return_value = [fused_moe] + + utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ) + self.assertEqual(fused_moe.w13_weight.data, 1) + + @mock.patch('torch_npu.get_npu_format') + @mock.patch('torch_npu.npu_format_cast') + @mock.patch('vllm.model_executor.layers.fused_moe.layer.FusedMoE', + new=mock.MagicMock) + def test_converting_weight_acl_format_format_true(self, mock_npu_cast, + mock_get_format): + ACL_FORMAT_FRACTAL_NZ = 29 + mock_get_format.return_value = ACL_FORMAT_FRACTAL_NZ + mock_npu_cast.return_value = 1 + + fused_moe = mock.MagicMock() + fused_moe.w13_weight = mock.MagicMock() + fused_moe.w2_weight = mock.MagicMock() + fused_moe.w13_weight.data = torch.randn(128, 256) + fused_moe.w2_weight.data = torch.randn(256, 128) + model = mock.MagicMock() + model.modules.return_value = [fused_moe] + + utils.converting_weight_acl_format(model, ACL_FORMAT_FRACTAL_NZ) + mock_npu_cast.assert_not_called() + + def test_torchair_quant_method_register(self): + + TorchairW8A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[ + "W8A8_DYNAMIC"] + TorchairW4A8DYNAMICQuantizer = SUPPORT_ASCEND_QUANTIZER_TYPE[ + "W4A8_DYNAMIC"] + utils.torchair_quant_method_register() + self.assertNotEqual(TorchairW8A8DYNAMICQuantizer, + SUPPORT_ASCEND_QUANTIZER_TYPE["W8A8_DYNAMIC"]) + self.assertNotEqual(TorchairW4A8DYNAMICQuantizer, + SUPPORT_ASCEND_QUANTIZER_TYPE["W4A8_DYNAMIC"]) diff --git a/tests/ut/worker/test_input_batch.py b/tests/ut/worker/test_input_batch.py new file mode 100644 index 0000000..a72dbdc --- /dev/null +++ b/tests/ut/worker/test_input_batch.py @@ -0,0 +1,372 @@ +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# +import inspect +from collections.abc import Sequence +from typing import Optional + +import numpy as np +import pytest +import torch +from vllm.sampling_params import SamplingParams +from vllm.utils import make_tensor_with_pad +from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import LogitsProcessors +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable + +from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch + +VOCAB_SIZE = 1024 +NUM_OUTPUT_TOKENS = 20 +MAX_PROMPT_SIZE = 100 +MAX_NUM_PROMPT_TOKENS = 64 + + +def _compare_objs(obj1, + obj2, + skip: Sequence = ("logitsprocs", "batch_update_builder")): + attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) + attr_names = set([ + a[0] for a in attrs + if not (a[0].startswith('__') and a[0].endswith('__')) + ]) + for attr_name in attr_names: + if attr_name in skip: + continue + + a = getattr(obj1, attr_name) + b = getattr(obj2, attr_name) + + is_same = False + if isinstance(a, torch.Tensor): + if (a.numel() == 0 or b.numel() == 0): + is_same = (a.numel() == 0 and b.numel() == 0) + elif torch.allclose(a, b): + is_same = True + elif isinstance(a, np.ndarray): + if np.allclose(a, b): + is_same = True + elif isinstance(a, MultiGroupBlockTable): + for a_i, b_i in zip(a.block_tables, b.block_tables): + 
_compare_objs(a_i, b_i) + is_same = True + elif isinstance(a, (BlockTable, SamplingMetadata, PoolingMetadata)): + _compare_objs(a, b) + is_same = True # if we make it here must be same + elif a == b: + is_same = True + assert is_same, f"Attribute {attr_name} is different"\ + f" in {obj1} and {obj2}: {a} != {b}" + + +def _remove_requests(input_batch: InputBatch, batch_size: int, + reqs: list[CachedRequestState]) -> set[str]: + """ + Remove some requests randomly from the batch and returns + set of request removed + """ + + num_reqs_to_remove = np.random.randint(0, batch_size) + req_indices_to_remove: set[int] = set() + for _ in range(num_reqs_to_remove): + req_index_to_remove = np.random.randint(0, batch_size) + req_indices_to_remove.add(req_index_to_remove) + + req_ids_to_remove: set[str] = set() + for index in req_indices_to_remove: + input_batch.remove_request(reqs[index].req_id) + req_ids_to_remove.add(reqs[index].req_id) + return req_ids_to_remove + + +def _construct_expected_sampling_metadata( + reqs: list[CachedRequestState], + req_ids_retained: set[int], + req_id_index_in_input_batch: dict[str, int], + device: torch.device, +) -> SamplingMetadata: + """ + Constructs and returns the expected SamplingMetadata for this + batch. 
+ """ + num_reqs = len(req_ids_retained) + output_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] + prompt_token_ids: list[list[int]] = [list() for _ in range(num_reqs)] + presence_penalties = [0.0 for _ in range(num_reqs)] + frequency_penalties = [0.0 for _ in range(num_reqs)] + repetition_penalties = [1.0 for _ in range(num_reqs)] + top_k = [0 for _ in range(num_reqs)] + top_p = [0.0 for _ in range(num_reqs)] + temperature = [0.0 for _ in range(num_reqs)] + min_tokens = {} + logit_bias = [None] * num_reqs + allowed_token_ids_mask = torch.zeros(num_reqs, + VOCAB_SIZE, + dtype=torch.bool, + device=device) + bad_words_token_ids = {} + for req in reqs: + if req.req_id not in req_ids_retained: + continue + index_in_input_batch = req_id_index_in_input_batch[req.req_id] + output_token_ids[index_in_input_batch] = req.output_token_ids + prompt_token_ids[index_in_input_batch] = req.prompt_token_ids + presence_penalties[ + index_in_input_batch] = req.sampling_params.presence_penalty + frequency_penalties[index_in_input_batch] = ( + req.sampling_params.frequency_penalty) + repetition_penalties[index_in_input_batch] = ( + req.sampling_params.repetition_penalty) + top_k[index_in_input_batch] = req.sampling_params.top_k + top_p[index_in_input_batch] = req.sampling_params.top_p + temperature[index_in_input_batch] = req.sampling_params.temperature + min_tokens[index_in_input_batch] = ( + req.sampling_params.min_tokens, + req.sampling_params.all_stop_token_ids) + logit_bias[index_in_input_batch] = req.sampling_params.logit_bias + if req.sampling_params.allowed_token_ids: + allowed_token_ids_mask[index_in_input_batch][ + req.sampling_params.allowed_token_ids] = True + if req.sampling_params.bad_words_token_ids: + bad_words_token_ids[ + index_in_input_batch] = req.sampling_params.bad_words_token_ids + + return SamplingMetadata( + temperature=torch.tensor(temperature, dtype=torch.float, + device=device), + all_greedy=False, + all_random=True, + top_p=None if all(x == 
1.0 for x in top_p) else torch.tensor( + top_p, dtype=torch.float, device=device), + top_k=None if all(x == 0 for x in top_k) else torch.tensor( + top_k, dtype=torch.int, device=device), + generators={}, + max_num_logprobs=0, + prompt_token_ids=make_tensor_with_pad( + prompt_token_ids, + pad=VOCAB_SIZE, + device=torch.device(device), + dtype=torch.int64, + ), + frequency_penalties=torch.tensor(frequency_penalties, + dtype=torch.float, + device=device), + presence_penalties=torch.tensor(presence_penalties, + dtype=torch.float, + device=device), + repetition_penalties=torch.tensor(repetition_penalties, + dtype=torch.float, + device=device), + output_token_ids=output_token_ids, + no_penalties=(all(x == 0 for x in presence_penalties) + and all(x == 0 for x in frequency_penalties) + and all(x == 1 for x in repetition_penalties)), + allowed_token_ids_mask=allowed_token_ids_mask, + bad_words_token_ids=bad_words_token_ids, + logitsprocs=LogitsProcessors(), + ) + + +def _create_sampling_params(): + return SamplingParams( + top_k=np.random.randint(1, 10), + top_p=np.random.uniform(0.0, 1.0), + presence_penalty=np.random.uniform(-2.0, 2.0), + repetition_penalty=np.random.uniform(0.0, 2.0), + frequency_penalty=np.random.uniform(-2.0, 2.0), + min_tokens=np.random.randint(1, 10), + stop_token_ids=[ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(10)) + ], + logit_bias={0: np.random.uniform(-3.0, 3.0)}, + ) + + +def _construct_cached_request_state(req_id_suffix: int): + prompt_token_ids = [ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(0, MAX_PROMPT_SIZE)) + ] + output_token_ids = [ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(0, NUM_OUTPUT_TOKENS)) + ] + return CachedRequestState( + req_id=f"req_id_{req_id_suffix}", + prompt_token_ids=prompt_token_ids, + sampling_params=_create_sampling_params(), + pooling_params=None, + mm_kwargs=[], + mm_positions=[], + block_ids=([], ), + generator=None, + 
num_computed_tokens=len(output_token_ids), + output_token_ids=output_token_ids, + mm_hashes=None, + ) + + +@pytest.mark.parametrize("device", ["cpu"]) +@pytest.mark.parametrize("batch_size", [1, 2, 32, 64]) +def test_sampling_metadata_in_input_batch(device: str, batch_size: int): + """ + Tests the logic for managing sampling metadata in the InputBatch. + + This test involves adding a set of requests to the InputBatch, + followed by removing a subset of them. Afterward, the batch is compacted, + and the `make_sampling_metadata` method is invoked on the batch. The + output of `make_sampling_metadata` is then compared against the expected + results to ensure correctness. + + Note: Ignore logits processor logic, which is tested separately + """ + input_batch: InputBatch = InputBatch( + max_num_reqs=batch_size, + max_model_len=1024, + max_num_batched_tokens=1024, + device=torch.device(device), + pin_memory=False, + vocab_size=1024, + block_sizes=[1], + ) + reqs: list[CachedRequestState] = [] + req_id_reqs = {} + req_id_output_token_ids = {} + + # Add requests + for req_index in range(batch_size): + req: CachedRequestState = _construct_cached_request_state(req_index) + assigned_req_index = input_batch.add_request(req) + assert req_index == assigned_req_index + reqs.append(req) + req_id_reqs[req.req_id] = req + req_id_output_token_ids[req.req_id] = req.output_token_ids + + # Remove some requests + req_ids_to_remove = _remove_requests(input_batch, batch_size, reqs) + req_ids_retained = set(req_id_reqs.keys()) - req_ids_to_remove + + # Compact the input batch + input_batch.condense() + + # Generate the sampling metadata + sampling_metadata = input_batch._make_sampling_metadata() + + # Create expected output. 
+ expected_sampling_metadata = _construct_expected_sampling_metadata( + reqs, + req_ids_retained, + input_batch.req_id_to_index, + device=torch.device(device)) + + def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool: + return (t1 is None + and t2 is None) or (t1 is not None and t2 is not None + and torch.allclose(t1, t2)) + + # Assert the actual and expected output. + assert torch.allclose(expected_sampling_metadata.temperature, + sampling_metadata.temperature) + assert same(expected_sampling_metadata.top_p, sampling_metadata.top_p) + assert same(expected_sampling_metadata.top_k, sampling_metadata.top_k) + assert torch.allclose( + expected_sampling_metadata.frequency_penalties, + sampling_metadata.frequency_penalties, + ) + assert torch.allclose( + expected_sampling_metadata.presence_penalties, + sampling_metadata.presence_penalties, + ) + assert torch.allclose( + expected_sampling_metadata.repetition_penalties, + sampling_metadata.repetition_penalties, + ) + assert torch.allclose(expected_sampling_metadata.prompt_token_ids, + sampling_metadata.prompt_token_ids) + assert (expected_sampling_metadata.output_token_ids == + sampling_metadata.output_token_ids) + assert expected_sampling_metadata.no_penalties == \ + sampling_metadata.no_penalties + if sampling_metadata.allowed_token_ids_mask: + assert torch.allclose( + expected_sampling_metadata.allowed_token_ids_mask, + sampling_metadata.allowed_token_ids_mask) + assert expected_sampling_metadata.bad_words_token_ids == \ + sampling_metadata.bad_words_token_ids + + +@pytest.mark.parametrize("device", ["cpu"]) +@pytest.mark.parametrize("batch_size", [32]) +@pytest.mark.parametrize("swap_list", [((0, 1), )]) +def test_swap_states_in_input_batch(device: str, batch_size: int, + swap_list: list): + """ + Tests the logic for managing sampling metadata in the InputBatch. + + This test involves adding a set of requests to the InputBatch, + followed by removing a subset of them. 
Afterward, the batch is compacted, + and the `make_sampling_metadata` method is invoked on the batch. The + output of `make_sampling_metadata` is then compared against the expected + results to ensure correctness. + + Note: Ignore logits processor logic, which is tested separately + """ + input_batch: InputBatch = InputBatch( + max_num_reqs=batch_size, + max_model_len=1024, + max_num_batched_tokens=1024, + device=torch.device(device), + pin_memory=False, + vocab_size=1024, + block_sizes=[1], + ) + ref_input_batch: InputBatch = InputBatch( + max_num_reqs=batch_size, + max_model_len=1024, + max_num_batched_tokens=1024, + device=torch.device(device), + pin_memory=False, + vocab_size=1024, + block_sizes=[1], + ) + + reqs: list[CachedRequestState] = [] + req_id_reqs = {} + req_id_output_token_ids = {} + # Add requests + for req_index in range(batch_size): + req: CachedRequestState = _construct_cached_request_state(req_index) + assigned_req_index = input_batch.add_request(req) + assert assigned_req_index == req_index + reqs.append(req) + req_id_reqs[req.req_id] = req + req_id_output_token_ids[req.req_id] = req.output_token_ids + + reordered_reqs = reqs.copy() + for swap_pair in swap_list: + reordered_reqs[swap_pair[0]], reordered_reqs[swap_pair[1]] = \ + reordered_reqs[swap_pair[1]], reordered_reqs[swap_pair[0]] + input_batch.swap_states(swap_pair[0], swap_pair[1]) + + for req_index in range(batch_size): + req = reordered_reqs[req_index] + assigned_req_index = ref_input_batch.add_request(req) + assert assigned_req_index == req_index + + input_batch.refresh_metadata() + ref_input_batch.refresh_metadata() + + _compare_objs(input_batch, ref_input_batch) diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py new file mode 100644 index 0000000..af3d904 --- /dev/null +++ b/tests/ut/worker/test_worker_v1.py @@ -0,0 +1,1143 @@ +import unittest +from unittest.mock import MagicMock, patch + +import torch +from vllm.config import CacheConfig, 
ModelConfig, ParallelConfig, VllmConfig + +from tests.ut.base import TestBase + + +class TestNPUWorker(TestBase): + + def setUp(self): + """Setup test environment""" + # Create configuration mocks + self.cache_config_mock = MagicMock(spec=CacheConfig) + self.cache_config_mock.cache_dtype = "auto" + + self.model_config_mock = MagicMock(spec=ModelConfig) + self.model_config_mock.dtype = torch.float16 + self.model_config_mock.trust_remote_code = False + + self.parallel_config_mock = MagicMock(spec=ParallelConfig) + + self.vllm_config_mock = MagicMock(spec=VllmConfig) + self.vllm_config_mock.cache_config = self.cache_config_mock + self.vllm_config_mock.model_config = self.model_config_mock + self.vllm_config_mock.parallel_config = self.parallel_config_mock + self.vllm_config_mock.additional_config = None + self.vllm_config_mock.load_config = None + self.vllm_config_mock.scheduler_config = None + self.vllm_config_mock.device_config = None + self.vllm_config_mock.compilation_config = None + + self.local_rank = 0 + self.rank = 0 + self.distributed_init_method = "tcp://localhost:12345" + self.is_driver_worker = False + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.ops") + @patch("vllm_ascend.worker.worker_v1._register_atb_extensions") + @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") + @patch("vllm_ascend.worker.worker_v1.init_ascend_config") + @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.try_register_lib") + @patch("vllm.utils.init_cached_hf_modules") + @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") + def test_init_npu_worker_normal_case( + self, + mock_init_profiler, + mock_init_cached_hf_modules, + mock_try_register_lib, + mock_init_ascend_soc_version, + mock_init_ascend_config, + mock_register_ascend_customop, + mock_register_atb_extensions, + mock_ops, + mock_adapt_patch, + ): + """Test NPUWorker normal initialization""" + # Setup mock behavior + 
mock_ops.register_dummy_fusion_op.return_value = None + + # Import and create NPUWorker instance + from vllm_ascend.worker.worker_v1 import NPUWorker + + worker = NPUWorker( + vllm_config=self.vllm_config_mock, + local_rank=self.local_rank, + rank=self.rank, + distributed_init_method=self.distributed_init_method, + is_driver_worker=self.is_driver_worker, + ) + + # Verify initialization call order + mock_adapt_patch.assert_called_once() + mock_ops.register_dummy_fusion_op.assert_called_once() + mock_register_atb_extensions.assert_called_once() + mock_register_ascend_customop.assert_called_once() + mock_init_ascend_config.assert_called_once_with(self.vllm_config_mock) + mock_init_ascend_soc_version.assert_called_once() + + # Verify try_register_lib call + mock_try_register_lib.assert_called_once_with( + "mindie_turbo", + "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo.", + ) + + # Verify cache_dtype setting + self.assertEqual(worker.cache_dtype, torch.float16) + mock_init_profiler.assert_called_once() + + # Verify init_cached_hf_modules is not called (trust_remote_code=False) + mock_init_cached_hf_modules.assert_not_called() + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.ops") + @patch("vllm_ascend.worker.worker_v1._register_atb_extensions") + @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") + @patch("vllm_ascend.worker.worker_v1.init_ascend_config") + @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.try_register_lib") + @patch("vllm.utils.init_cached_hf_modules") + @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") + def test_init_npu_worker_with_trust_remote_code( + self, + mock_init_profiler, + mock_init_cached_hf_modules, + mock_try_register_lib, + mock_init_ascend_soc_version, + mock_init_ascend_config, + mock_register_ascend_customop, + mock_register_atb_extensions, + mock_ops, + mock_adapt_patch, + ): + """Test NPUWorker 
initialization with trust_remote_code=True""" + # Set trust_remote_code=True + self.model_config_mock.trust_remote_code = True + mock_ops.register_dummy_fusion_op.return_value = None + + # Create NPUWorker instance + from vllm_ascend.worker.worker_v1 import NPUWorker + + _ = NPUWorker( + vllm_config=self.vllm_config_mock, + local_rank=self.local_rank, + rank=self.rank, + distributed_init_method=self.distributed_init_method, + is_driver_worker=self.is_driver_worker, + ) + + # Verify init_cached_hf_modules is called (trust_remote_code=True) + mock_init_cached_hf_modules.assert_called_once() + + @patch("vllm_ascend.utils.adapt_patch") + @patch("vllm_ascend.ops") + @patch("vllm_ascend.worker.worker_v1._register_atb_extensions") + @patch("vllm_ascend.worker.worker_v1.register_ascend_customop") + @patch("vllm_ascend.worker.worker_v1.init_ascend_config") + @patch("vllm_ascend.worker.worker_v1.init_ascend_soc_version") + @patch("vllm_ascend.worker.worker_v1.try_register_lib") + @patch("vllm.utils.init_cached_hf_modules") + @patch("vllm_ascend.worker.worker_v1.NPUWorker._init_profiler") + def test_init_npu_worker_with_custom_cache_dtype( + self, + mock_init_profiler, + mock_init_cached_hf_modules, + mock_try_register_lib, + mock_init_ascend_soc_version, + mock_init_ascend_config, + mock_register_ascend_customop, + mock_register_atb_extensions, + mock_ops, + mock_adapt_patch, + ): + """Test NPUWorker initialization with custom cache_dtype""" + # Set custom cache_dtype + self.cache_config_mock.cache_dtype = "float32" + mock_ops.register_dummy_fusion_op.return_value = None + + # Create NPUWorker instance + from vllm_ascend.worker.worker_v1 import NPUWorker + + with patch("vllm.utils.STR_DTYPE_TO_TORCH_DTYPE", + {"float32": torch.float32}): + worker = NPUWorker( + vllm_config=self.vllm_config_mock, + local_rank=self.local_rank, + rank=self.rank, + distributed_init_method=self.distributed_init_method, + is_driver_worker=self.is_driver_worker, + ) + + # Verify cache_dtype is set 
to custom value + self.assertEqual(worker.cache_dtype, torch.float32) + + def test_initialize_cache(self): + """Test initialize_cache method""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create a simple worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.cache_config = MagicMock() + + # Test initialize_cache + worker.initialize_cache(100, 50) + + # Verify parameter setting + self.assertEqual(worker.cache_config.num_gpu_blocks, 100) + self.assertEqual(worker.cache_config.num_cpu_blocks, 50) + + @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform") + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") + @patch("vllm_ascend.worker.worker_v1.logger") + def test_sleep_mode_enabled(self, mock_logger, mock_allocator_class, + mock_platform, mock_sleep_mode_enabled): + """Test sleep method when sleep mode is enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock + mock_sleep_mode_enabled.return_value = True + mock_platform.mem_get_info.side_effect = [ + (1000, 2000), + (1200, 2000), + ] # before, after + mock_allocator = MagicMock() + mock_allocator_class.get_instance.return_value = mock_allocator + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test sleep method + worker.sleep(level=1) + + # Verify calls + mock_sleep_mode_enabled.assert_called_once() + mock_allocator.sleep.assert_called_once_with( + offload_tags=("weights", )) + self.assertEqual(mock_platform.mem_get_info.call_count, + 2) # Called 2 times in sleep method + # Verify log output + mock_logger.info.assert_called_once() + + @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") + def test_sleep_mode_disabled_raises_error(self, mock_sleep_mode_enabled): + """Test sleep method raises exception when sleep mode is disabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker 
+ + # Set sleep mode disabled + mock_sleep_mode_enabled.return_value = False + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test sleep method should raise exception + with self.assertRaises(ValueError) as cm: + worker.sleep() + + self.assertIn("Sleep mode is not enabled", str(cm.exception)) + + @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") + def test_wake_up_mode_enabled(self, mock_allocator_class, + mock_sleep_mode_enabled): + """Test wake_up method when sleep mode is enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock + mock_sleep_mode_enabled.return_value = True + mock_allocator = MagicMock() + mock_allocator_class.get_instance.return_value = mock_allocator + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test wake_up method + worker.wake_up(tags=["test_tag"]) + + # Verify calls + mock_sleep_mode_enabled.assert_called_once() + mock_allocator.wake_up.assert_called_once_with(tags=["test_tag"]) + + @patch("vllm_ascend.worker.worker_v1.sleep_mode_enabled") + def test_wake_up_mode_disabled_raises_error(self, mock_sleep_mode_enabled): + """Test wake_up method raises exception when sleep mode is disabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Set sleep mode disabled + mock_sleep_mode_enabled.return_value = False + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test wake_up method should raise exception + with self.assertRaises(ValueError) as cm: + worker.wake_up() + + self.assertIn("Sleep mode is not enabled", str(cm.exception)) + + @patch( + "vllm_ascend.worker.worker_v1.NPUWorker._init_worker_distributed_environment" + ) + @patch("vllm_ascend.worker.worker_v1.NPUPlatform") + def test_init_device(self, mock_platform, 
mock_init_dist_env): + """Test _init_device method""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock + mock_platform.mem_get_info.return_value = (1000, 2000) + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.local_rank = 1 + worker.model_config = MagicMock() + worker.model_config.seed = 42 + + # Test _init_device + result = worker._init_device() + + # Verify NPUPlatform.set_device is called + mock_platform.set_device.assert_called_once() + # Verify the parameter passed to set_device is a torch.device object + call_args = mock_platform.set_device.call_args[0][0] + self.assertEqual(str(call_args), "npu:1") + + mock_platform.empty_cache.assert_called_once() + mock_platform.seed_everything.assert_called_once_with(42) + mock_platform.mem_get_info.assert_called_once( + ) # Called once in _init_device method + mock_init_dist_env.assert_called_once( + ) # Verify distributed initialization is called + + # Verify return value is a torch.device object + self.assertEqual(str(result), "npu:1") + self.assertEqual(worker.init_npu_memory, 1000) + + def test_profile_start_stop(self): + """Test profile method start and stop""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + mock_profiler = MagicMock() + worker.profiler = mock_profiler + + # Test start profiler + worker.profile(is_start=True) + mock_profiler.start.assert_called_once() + + # Test stop profiler + worker.profile(is_start=False) + mock_profiler.stop.assert_called_once() + + def test_profile_no_profiler_raises_error(self): + """Test profile method raises exception when profiler is not available""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.profiler = None + + # Test 
should raise exception + with self.assertRaises(RuntimeError) as cm: + worker.profile() + + self.assertIn("Profiler is not enabled", str(cm.exception)) + + def test_lora_methods(self): + """Test LoRA related methods""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + mock_model_runner = MagicMock() + worker.model_runner = mock_model_runner + + # Set return values + mock_model_runner.add_lora.return_value = True + mock_model_runner.remove_lora.return_value = True + mock_model_runner.list_loras.return_value = {1, 2, 3} + mock_model_runner.pin_lora.return_value = True + + # Test each method + mock_request = MagicMock() + self.assertTrue(worker.add_lora(mock_request)) + mock_model_runner.add_lora.assert_called_once_with(mock_request) + + self.assertTrue(worker.remove_lora(1)) + mock_model_runner.remove_lora.assert_called_once_with(1) + + self.assertEqual(worker.list_loras(), {1, 2, 3}) + mock_model_runner.list_loras.assert_called_once() + + self.assertTrue(worker.pin_lora(2)) + mock_model_runner.pin_lora.assert_called_once_with(2) + + def test_get_methods(self): + """Test various get methods""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + mock_model_runner = MagicMock() + worker.model_runner = mock_model_runner + + # Set return values + mock_model = MagicMock() + mock_kv_cache_spec = {"test": "spec"} + mock_pooling_tasks = ["task1", "task2"] + mock_supported_tasks = ("task1", "task2") + + mock_model_runner.get_model.return_value = mock_model + mock_model_runner.get_kv_cache_spec.return_value = mock_kv_cache_spec + mock_model_runner.get_supported_pooling_tasks.return_value = ( + mock_pooling_tasks) + mock_model_runner.get_supported_tasks.return_value = mock_supported_tasks + + # Test each get method + 
self.assertEqual(worker.get_model(), mock_model) + self.assertEqual(worker.get_kv_cache_spec(), mock_kv_cache_spec) + self.assertEqual(worker.get_supported_pooling_tasks(), + mock_pooling_tasks) + self.assertEqual(worker.get_supported_tasks(), + mock_supported_tasks) + + def test_execute_dummy_batch(self): + """Test execute_dummy_batch method""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + mock_model_runner = MagicMock() + worker.model_runner = mock_model_runner + + # Test execute_dummy_batch + worker.execute_dummy_batch() + + # Verify call + mock_model_runner._dummy_run.assert_called_once_with(1) + + @patch("vllm_ascend.worker.worker_v1.envs_vllm") + @patch("vllm_ascend.worker.worker_v1.logger") + @patch("torch_npu.profiler._ExperimentalConfig") + @patch("torch_npu.profiler.profile") + @patch("torch_npu.profiler.tensorboard_trace_handler") + @patch("torch_npu.profiler.ExportType") + @patch("torch_npu.profiler.ProfilerLevel") + @patch("torch_npu.profiler.AiCMetrics") + @patch("torch_npu.profiler.ProfilerActivity") + def test_init_profiler_enabled( + self, + mock_profiler_activity, + mock_aic_metrics, + mock_profiler_level, + mock_export_type, + mock_trace_handler, + mock_profile, + mock_experimental_config, + mock_logger, + mock_envs_vllm, + ): + """Test _init_profiler method - profiler enabled case with stack and memory profiling enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Set environment variables to enable profiler + mock_envs_vllm.VLLM_TORCH_PROFILER_DIR = "/path/to/traces" + mock_envs_vllm.VLLM_TORCH_PROFILER_WITH_STACK = True + mock_envs_vllm.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY = True + + # Set enum mocks + mock_export_type.Text = "Text" + mock_profiler_level.Level1 = "Level1" + mock_aic_metrics.AiCoreNone = "AiCoreNone" + mock_profiler_activity.CPU = "CPU" + mock_profiler_activity.NPU = "NPU" + + # Set 
mock return values + mock_experimental_config_instance = MagicMock() + mock_experimental_config.return_value = mock_experimental_config_instance + mock_trace_handler_instance = MagicMock() + mock_trace_handler.return_value = mock_trace_handler_instance + mock_profiler_instance = MagicMock() + mock_profile.return_value = mock_profiler_instance + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test _init_profiler + result = worker._init_profiler() + + # Verify log output + mock_logger.info.assert_called_once_with( + "Profiling enabled. Traces will be saved to: %s", + "/path/to/traces") + + # Verify ExperimentalConfig creation + mock_experimental_config.assert_called_once() + config_call = mock_experimental_config.call_args + config_kwargs = config_call.kwargs + + # Verify configuration parameters + expected_config = { + "export_type": "Text", + "profiler_level": "Level1", + "msprof_tx": False, + "aic_metrics": "AiCoreNone", + "l2_cache": False, + "op_attr": False, + "data_simplification": False, + "record_op_args": False, + "gc_detect_threshold": None, + } + for key, expected_value in expected_config.items(): + self.assertEqual(config_kwargs[key], expected_value) + + # Verify trace handler creation + mock_trace_handler.assert_called_once_with("/path/to/traces") + + # Verify profiler creation + mock_profile.assert_called_once() + profile_call = mock_profile.call_args + profile_kwargs = profile_call.kwargs + + # Verify profiler parameters + expected_activities = ["CPU", "NPU"] + self.assertEqual(profile_kwargs["activities"], expected_activities) + self.assertTrue(profile_kwargs["with_stack"]) + self.assertTrue(profile_kwargs["profile_memory"]) + self.assertFalse(profile_kwargs["with_modules"]) + self.assertEqual(profile_kwargs["experimental_config"], + mock_experimental_config_instance) + self.assertEqual(profile_kwargs["on_trace_ready"], + mock_trace_handler_instance) + + # Verify return value + 
self.assertEqual(result, mock_profiler_instance) + + @patch("vllm_ascend.worker.worker_v1.envs_vllm") + def test_init_profiler_disabled(self, mock_envs_vllm): + """Test _init_profiler method - profiler disabled case""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Set environment variable to disable profiler + mock_envs_vllm.VLLM_TORCH_PROFILER_DIR = None + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test _init_profiler + result = worker._init_profiler() + + # Verify returns None + self.assertIsNone(result) + + @patch("vllm_ascend.worker.worker_v1.envs_vllm") + def test_init_profiler_empty_dir(self, mock_envs_vllm): + """Test _init_profiler method - empty directory string case""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Set environment variable to empty string + mock_envs_vllm.VLLM_TORCH_PROFILER_DIR = "" + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + + # Test _init_profiler + result = worker._init_profiler() + + # Verify returns None (empty string is considered false) + self.assertIsNone(result) + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.clear_npu_memory") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.empty_cache") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.mem_get_info") + @patch("torch_npu.npu.memory_stats") + @patch("torch_npu.npu.mem_get_info") + @patch("vllm_ascend.worker.worker_v1.logger") + def test_determine_available_memory_normal_case( + self, + mock_logger, + mock_torch_mem_get_info, + mock_torch_memory_stats, + mock_platform_mem_get_info, + mock_platform_empty_cache, + mock_platform_clear_npu_memory, + ): + """Test determine_available_memory normal case (no non-torch memory allocation)""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock - test case without non-torch memory allocation + mock_platform_mem_get_info.side_effect = [ + 
(8000, 10000), # 1st call: before profile execution + (7000, 10000), # 2nd call: after profile execution + ] + mock_torch_memory_stats.side_effect = [ + { + "allocated_bytes.all.peak": 2000 + }, # peak memory + { + "allocated_bytes.all.current": 3000 + }, # current allocated = total_allocated_bytes + ] + # Mock setup to simulate memory change between calls, exposing potential race condition + # The implementation calls torch_npu.npu.mem_get_info() twice in total_allocated_bytes calculation + # which is not atomic and can lead to incorrect memory calculations + mock_torch_mem_get_info.side_effect = [ + (7000, 10000), # First call for total_allocated_bytes calculation + ( + 6000, + 10000, + ), # Second call for total_allocated_bytes calculation, simulating an allocation + (6000, 10000), # Additional calls for other parts of the method + (6000, 10000), + ] + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.init_npu_memory = ( + 8500 # Initial memory greater than current free memory + ) + worker.model_runner = MagicMock() + worker.cache_config = MagicMock() + worker.cache_config.gpu_memory_utilization = 0.8 + + # Test determine_available_memory + result = worker.determine_available_memory() + + # Verify call count and order + mock_platform_clear_npu_memory.assert_called_once() + self.assertEqual(mock_platform_mem_get_info.call_count, 2) + worker.model_runner.profile_run.assert_called_once() + mock_platform_empty_cache.assert_called_once() + + # Verify calculation result with race condition simulation + # Calculation logic: + # total_allocated_bytes = torch_npu.npu.mem_get_info()[1] - torch_npu.npu.mem_get_info()[0] + # = 10000 - 7000 = 3000 (first call) + # = 10000 - 6000 = 4000 (second call, memory changed!) 
+ # This exposes the race condition where memory state changes between calls + # non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + # = 4000 - 3000 = 1000 # Non-torch memory allocation detected + # peak_memory = torch_peak_memory + non_torch_allocations + # = 2000 + 1000 = 3000 + # available = total_npu_memory * gpu_memory_utilization - peak_memory + # = 10000 * 0.8 - 3000 = 5000 + expected_result = max(0, int(10000 * 0.8 - 3000)) + self.assertEqual(result, expected_result) + + # Verify log output + mock_logger.info.assert_called_once() + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.clear_npu_memory") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.empty_cache") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.mem_get_info") + @patch("torch_npu.npu.memory_stats") + @patch("torch_npu.npu.mem_get_info") + def test_determine_available_memory_with_non_torch_allocations( + self, + mock_torch_mem_get_info, + mock_torch_memory_stats, + mock_platform_mem_get_info, + mock_platform_empty_cache, + mock_platform_clear_npu_memory, + ): + """Test determine_available_memory with significant non-torch memory allocation""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock - test case with large non-torch memory allocation + mock_platform_mem_get_info.side_effect = [ + (8000, 10000), # 1st call + (7000, 10000), # 2nd call + ] + mock_torch_memory_stats.side_effect = [ + { + "allocated_bytes.all.peak": 1500 + }, # peak memory + { + "allocated_bytes.all.current": 1000 + }, # current allocated + ] + # Mock setup to expose race condition in total_allocated_bytes calculation + # Setup non-torch allocations > 0 case with memory change simulation + mock_torch_mem_get_info.side_effect = [ + (6000, 10000), # First call for total_allocated_bytes calculation + ( + 5000, + 10000, + ), # Second call for total_allocated_bytes calculation, simulating allocation + (5000, 10000), # Additional calls for other parts of the method + (5000, 10000), + ] 
+ + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.init_npu_memory = 8500 + worker.model_runner = MagicMock() + worker.cache_config = MagicMock() + worker.cache_config.gpu_memory_utilization = 0.9 + + # Test determine_available_memory + result = worker.determine_available_memory() + + # Verify result: case with large non-torch memory allocation and race condition + # total_allocated_bytes = torch_npu.npu.mem_get_info()[1] - torch_npu.npu.mem_get_info()[0] + # = 10000 - 6000 = 4000 (first call) + # = 10000 - 5000 = 5000 (second call, memory changed!) + # This exposes the race condition where memory allocation occurs between calls + # non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + # = 5000 - 1000 = 4000 # Significant non-torch allocation detected + # peak_memory = torch_peak_memory + non_torch_allocations + # = 1500 + 4000 = 5500 + # available = total_npu_memory * gpu_memory_utilization - peak_memory + # = 10000 * 0.9 - 5500 = 3500 + expected_result = max(0, int(10000 * 0.9 - 5500)) + self.assertEqual(result, expected_result) + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.clear_npu_memory") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.mem_get_info") + def test_determine_available_memory_memory_profiling_error( + self, mock_platform_mem_get_info, mock_platform_clear_npu_memory): + """Test determine_available_memory throws exception on memory profiling error""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock: initial memory less than current free memory (error case) + mock_platform_mem_get_info.side_effect = [ + (8000, 10000), # 1st call + (9000, 10000), # 2nd call: free memory increased instead + ] + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.init_npu_memory = 8500 # Initial memory < current free memory 9000 + worker.model_runner = MagicMock() + 
worker.cache_config = MagicMock() + worker.cache_config.gpu_memory_utilization = 0.8 + + # Test should throw exception + with self.assertRaises(AssertionError) as cm: + worker.determine_available_memory() + + self.assertIn("Error in memory profiling", str(cm.exception)) + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.clear_npu_memory") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.empty_cache") + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.mem_get_info") + @patch("torch_npu.npu.memory_stats") + @patch("torch_npu.npu.mem_get_info") + def test_determine_available_memory_negative_result( + self, + mock_torch_mem_get_info, + mock_torch_memory_stats, + mock_platform_mem_get_info, + mock_platform_empty_cache, + mock_platform_clear_npu_memory, + ): + """Test determine_available_memory returns 0 when result is negative""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Setup mock: high peak memory causes negative available memory + mock_platform_mem_get_info.side_effect = [ + (8000, 10000), # 1st call + (3000, 10000), # 2nd call + ] + mock_torch_memory_stats.side_effect = [ + { + "allocated_bytes.all.peak": 9000 + }, # High peak memory + { + "allocated_bytes.all.current": 7000 + }, + ] + # Mock setup to expose race condition even in negative result scenarios + mock_torch_mem_get_info.side_effect = [ + (3000, 10000), # First call for total_allocated_bytes calculation + ( + 2000, + 10000, + ), # Second call for total_allocated_bytes calculation, simulating more allocation + (2000, 10000), # Additional calls for other parts of the method + (2000, 10000), + ] + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.init_npu_memory = 8500 + worker.model_runner = MagicMock() + worker.cache_config = MagicMock() + worker.cache_config.gpu_memory_utilization = 0.8 + + # Test determine_available_memory + result = worker.determine_available_memory() + + # Verify result is 0 (not negative) 
even with race condition + # total_allocated_bytes = torch_npu.npu.mem_get_info()[1] - torch_npu.npu.mem_get_info()[0] + # = 10000 - 3000 = 7000 (first call) + # = 10000 - 2000 = 8000 (second call, more memory allocated!) + # non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + # = 8000 - 7000 = 1000 # Additional non-torch allocation detected + # peak_memory = torch_peak_memory + non_torch_allocations + # = 9000 + 1000 = 10000 + # available = total_npu_memory * gpu_memory_utilization - peak_memory + # = 10000 * 0.8 - 10000 = -2000, max(0, -2000) = 0 + self.assertEqual(result, 0) + + def test_execute_model_first_rank(self): + """Test execute_model method - first rank case""" + from vllm.v1.outputs import ModelRunnerOutput + + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with ( + patch.object(NPUWorker, "__init__", lambda x, **kwargs: None), + patch("vllm_ascend.worker.worker_v1.get_pp_group") as + mock_get_pp_group, + ): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.parallel_config = MagicMock() + worker.vllm_config.parallel_config.distributed_executor_backend = "ray" + + # Set as first rank + mock_pp_group = MagicMock() + mock_pp_group.is_first_rank = True + mock_pp_group.is_last_rank = True + mock_get_pp_group.return_value = mock_pp_group + + # Mock scheduler_output and return result + mock_scheduler_output = MagicMock() + # Create a real ModelRunnerOutput instance or mock + mock_model_output = MagicMock(spec=ModelRunnerOutput) + worker.model_runner.execute_model.return_value = mock_model_output + + # Test execute_model + result = worker.execute_model(mock_scheduler_output) + + # Verify call + worker.model_runner.execute_model.assert_called_once_with( + mock_scheduler_output, None) + self.assertEqual(result, mock_model_output) + + @patch("vllm_ascend.worker.worker_v1.get_pp_group") + @patch("vllm_ascend.worker.worker_v1.get_tp_group") + 
@patch("vllm_ascend.worker.worker_v1.has_kv_transfer_group") + def test_execute_model_middle_rank(self, mock_has_kv_transfer_group, + mock_get_tp_group, mock_get_pp_group): + """Test execute_model method - middle rank case""" + from vllm.sequence import IntermediateTensors + + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.parallel_config = MagicMock() + worker.vllm_config.parallel_config.distributed_executor_backend = "ray" + + # Set as middle rank (not first, not last) + mock_pp_group = MagicMock() + mock_pp_group.is_first_rank = False + mock_pp_group.is_last_rank = False + mock_get_pp_group.return_value = mock_pp_group + + # Setup tensor reception data + mock_pp_group.recv_tensor_dict.return_value = {"tensor": "data"} + + # Mock return IntermediateTensors - use real type + mock_intermediate_output = MagicMock(spec=IntermediateTensors) + mock_intermediate_output.tensors = {"output_tensor": "data"} + mock_intermediate_output.kv_connector_output = ( + None # Set to None to trigger return None + ) + worker.model_runner.execute_model.return_value = mock_intermediate_output + + # Set has_kv_transfer_group returns False + mock_has_kv_transfer_group.return_value = False + + mock_scheduler_output = MagicMock() + + # Test execute_model + result = worker.execute_model(mock_scheduler_output) + + # Verify tensor reception + mock_pp_group.recv_tensor_dict.assert_called_once() + + # Verify model execution with intermediate_tensors + # Second parameter should be IntermediateTensors instance + worker.model_runner.execute_model.assert_called_once() + args, kwargs = worker.model_runner.execute_model.call_args + self.assertEqual(args[0], mock_scheduler_output) + self.assertIsInstance(args[1], IntermediateTensors) + + # Verify tensor sending + 
mock_pp_group.send_tensor_dict.assert_called_once() + + # Middle rank without kv_transfer_group should return None + self.assertIsNone(result) + + def test_execute_model_external_launcher(self): + """Test execute_model method - external_launcher mode""" + from vllm.v1.outputs import ModelRunnerOutput + + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with ( + patch.object(NPUWorker, "__init__", lambda x, **kwargs: None), + patch("vllm_ascend.worker.worker_v1.get_pp_group") as + mock_get_pp_group, + ): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.parallel_config = MagicMock() + worker.vllm_config.parallel_config.distributed_executor_backend = ( + "external_launcher") + + # Set as non-last rank + mock_pp_group = MagicMock() + mock_pp_group.is_first_rank = True + mock_pp_group.is_last_rank = False + mock_get_pp_group.return_value = mock_pp_group + + # Mock return result + mock_scheduler_output = MagicMock() + mock_model_output = MagicMock(spec=ModelRunnerOutput) + worker.model_runner.execute_model.return_value = mock_model_output + + # Test execute_model + result = worker.execute_model(mock_scheduler_output) + + # In external_launcher mode, it doesn't enter middle processing logic, returns result directly + self.assertEqual(result, mock_model_output) + + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") + def test_load_model_with_sleep_mode(self, mock_allocator_class): + """Test load_model method - with sleep mode enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.model_config = MagicMock() + worker.vllm_config.model_config.enable_sleep_mode = True + + # Setup allocator mock + mock_allocator = MagicMock() + 
mock_allocator.get_current_usage.return_value = 0 + mock_context = MagicMock() + mock_allocator.use_memory_pool.return_value = mock_context + mock_allocator_class.get_instance.return_value = mock_allocator + + # Test load_model + worker.load_model() + + # Verify calls + mock_allocator_class.get_instance.assert_called_once() + mock_allocator.get_current_usage.assert_called_once() + mock_allocator.use_memory_pool.assert_called_once_with( + tag="weights") + worker.model_runner.load_model.assert_called_once() + + def test_load_model_without_sleep_mode(self): + """Test load_model method - without sleep mode enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.model_config = MagicMock() + worker.vllm_config.model_config.enable_sleep_mode = False + + # Test load_model + worker.load_model() + + # Verify calls + worker.model_runner.load_model.assert_called_once() + + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") + def test_load_model_sleep_mode_assertion_error(self, mock_allocator_class): + """Test load_model method - assertion error in sleep mode""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.model_config = MagicMock() + worker.vllm_config.model_config.enable_sleep_mode = True + + # Setup allocator mock - current usage is not 0 + mock_allocator = MagicMock() + mock_allocator.get_current_usage.return_value = 100 # Non-zero value + mock_allocator_class.get_instance.return_value = mock_allocator + + # Test should throw assertion error + with self.assertRaises(AssertionError) as cm: + worker.load_model() + + self.assertIn("Sleep mode can 
only be", str(cm.exception)) + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything") + @patch("vllm_ascend.worker.worker_v1.logger") + def test_compile_or_warm_up_model_with_eager_mode(self, mock_logger, + mock_seed_everything): + """Test compile_or_warm_up_model method - eager mode""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.model_config = MagicMock() + worker.model_config.enforce_eager = True + worker.model_config.seed = 12345 + + # Setup compilation config + worker.vllm_config.compilation_config = MagicMock() + worker.vllm_config.compilation_config.compile_sizes = [1, 4, 8, 16] + worker.vllm_config.compilation_config.cudagraph_capture_sizes = [ + 4, 8 + ] + + # Test compile_or_warm_up_model + worker.compile_or_warm_up_model() + + # Verify _dummy_run call count and order (by size descending) + expected_calls = [ + unittest.mock.call(16), + unittest.mock.call(8), + unittest.mock.call(4), + unittest.mock.call(1), + ] + worker.model_runner._dummy_run.assert_has_calls(expected_calls) + + # Should not call capture_model in eager mode + worker.model_runner.capture_model.assert_not_called() + + # Verify log output + self.assertEqual(mock_logger.info.call_count, 4) + + # Verify seed setting + mock_seed_everything.assert_called_once_with(12345) + + @patch("vllm_ascend.worker.worker_v1.NPUPlatform.seed_everything") + @patch("vllm_ascend.worker.worker_v1.logger") + def test_compile_or_warm_up_model_with_graph_capture( + self, mock_logger, mock_seed_everything): + """Test compile_or_warm_up_model method - with graph capture enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + 
worker.vllm_config = MagicMock() + worker.model_config = MagicMock() + worker.model_config.enforce_eager = False # Enable graph capture + worker.model_config.seed = 67890 + + # Setup compilation config + worker.vllm_config.compilation_config = MagicMock() + worker.vllm_config.compilation_config.compile_sizes = [1, 4, 8, 16] + worker.vllm_config.compilation_config.cudagraph_capture_sizes = [ + 4, 8 + ] + + # Test compile_or_warm_up_model + worker.compile_or_warm_up_model() + + # Verify only call _dummy_run for sizes not in cudagraph_capture_sizes + expected_calls = [unittest.mock.call(16), unittest.mock.call(1)] + worker.model_runner._dummy_run.assert_has_calls(expected_calls) + + # Should call capture_model in non-eager mode + worker.model_runner.capture_model.assert_called_once() + + # Verify seed setting + mock_seed_everything.assert_called_once_with(67890) + + @patch("vllm_ascend.worker.worker_v1.CaMemAllocator") + def test_initialize_from_config_with_sleep_mode(self, + mock_allocator_class): + """Test initialize_from_config method - with sleep mode enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.model_config = MagicMock() + worker.vllm_config.model_config.enable_sleep_mode = True + + # Setup allocator mock + mock_allocator = MagicMock() + mock_context = MagicMock() + mock_allocator.use_memory_pool.return_value = mock_context + mock_allocator_class.get_instance.return_value = mock_allocator + + # Create mock kv_cache_config + mock_kv_cache_config = MagicMock() + + # Test initialize_from_config + worker.initialize_from_config(mock_kv_cache_config) + + # Verify calls + mock_allocator_class.get_instance.assert_called_once() + mock_allocator.use_memory_pool.assert_called_once_with( + tag="kv_cache") + 
worker.model_runner.initialize_kv_cache.assert_called_once_with( + mock_kv_cache_config) + + def test_initialize_from_config_without_sleep_mode(self): + """Test initialize_from_config method - without sleep mode enabled""" + from vllm_ascend.worker.worker_v1 import NPUWorker + + # Create worker mock + with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): + worker = NPUWorker() + worker.model_runner = MagicMock() + worker.vllm_config = MagicMock() + worker.vllm_config.model_config = MagicMock() + worker.vllm_config.model_config.enable_sleep_mode = False + + # Create mock kv_cache_config + mock_kv_cache_config = MagicMock() + + # Test initialize_from_config + worker.initialize_from_config(mock_kv_cache_config) + + # Verify calls + worker.model_runner.initialize_kv_cache.assert_called_once_with( + mock_kv_cache_config) diff --git a/tools/actionlint.sh b/tools/actionlint.sh new file mode 100755 index 0000000..d1950db --- /dev/null +++ b/tools/actionlint.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# +export SHELLCHECK_OPTS="--exclude=SC2046,SC2006,SC2086" + +if command -v actionlint &> /dev/null; then + actionlint .github/workflows/*.yml .github/workflows/*.yaml + exit 0 +elif [ -x ./actionlint ]; then + ./actionlint .github/workflows/*.yml .github/workflows/*.yaml + exit 0 +fi + +# download a binary to the current directory - v1.7.3 +bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) +./actionlint .github/workflows/*.yml .github/workflows/*.yaml diff --git a/tools/check_python_src_init.py b/tools/check_python_src_init.py new file mode 100644 index 0000000..2c7e0a9 --- /dev/null +++ b/tools/check_python_src_init.py @@ -0,0 +1,76 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# +import os +import sys + +VLLM_ASCEND_SRC = "vllm_ascend" +# TODO: Re-enable this after upstream fixed +# VLLM_SRC = "vllm-empty/vllm" + + +def check_init_file_in_package(directory): + """ + Check if a Python package directory contains __init__.py file. + A directory is considered a Python package if it contains `.py` files and an `__init__.py` file. 
+ """ + try: + files = os.listdir(directory) + except FileNotFoundError: + print(f"Warning: Directory does not exist: {directory}") + return False + + # If any .py file exists, we expect an __init__.py + if any(f.endswith('.py') for f in files): + init_file = os.path.join(directory, '__init__.py') + if not os.path.isfile(init_file): + return False + return True + + +def find_missing_init_dirs(src_dir): + """ + Walk through the src_dir and return subdirectories missing __init__.py. + """ + missing_init = set() + for dirpath, _, _ in os.walk(src_dir): + if not check_init_file_in_package(dirpath): + missing_init.add(dirpath) + return missing_init + + +def main(): + all_missing = set() + + for src in [VLLM_ASCEND_SRC]: + missing = find_missing_init_dirs(src) + all_missing.update(missing) + + if all_missing: + print( + "❌ Missing '__init__.py' files in the following Python package directories:" + ) + for pkg in sorted(all_missing): + print(f" - {pkg}") + sys.exit(1) + else: + print("✅ All Python packages have __init__.py files.") + + +if __name__ == "__main__": + main() diff --git a/tools/check_repo.sh b/tools/check_repo.sh new file mode 100644 index 0000000..7b01da0 --- /dev/null +++ b/tools/check_repo.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) + +if ! git diff --quiet; then + echo "Repo is dirty" >&2 + + exit 1 +fi + +if ! git describe --tags; then + echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 + + exit 1 +fi diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py new file mode 100644 index 0000000..92e6f79 --- /dev/null +++ b/tools/enforce_regex_import.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +from __future__ import annotations + +import subprocess +from pathlib import Path + +import regex as re + +FORBIDDEN_PATTERNS = re.compile( + r'^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)') +ALLOWED_PATTERNS = [ + re.compile(r'^\s*import\s+regex\s+as\s+re\s*$'), + re.compile(r'^\s*import\s+regex\s*$'), +] + + +def get_staged_python_files() -> list[str]: + try: + result = subprocess.run( + ['git', 'diff', '--cached', '--name-only', '--diff-filter=AM'], + capture_output=True, + text=True, + check=True) + files = result.stdout.strip().split( + '\n') if result.stdout.strip() else [] + return [f for f in files if f.endswith('.py')] + except subprocess.CalledProcessError: + return [] + + +def is_forbidden_import(line: str) -> bool: + line = line.strip() + return bool( + FORBIDDEN_PATTERNS.match(line) + and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS)) + + +def check_file(filepath: str) -> list[tuple[int, str]]: + violations = [] + try: + with open(filepath, encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if is_forbidden_import(line): + violations.append((line_num, line.strip())) + except (OSError, UnicodeDecodeError): + pass + return violations + + +def main() -> int: + files = get_staged_python_files() + if not files: + return 0 + + total_violations = 0 + + for filepath in files: + if not Path(filepath).exists(): + continue + + if filepath == "setup.py": + continue + + violations = check_file(filepath) + if violations: + print(f"\n❌ {filepath}:") + for line_num, line in violations: + print(f" Line {line_num}: {line}") + total_violations += 1 + + if total_violations > 0: + print(f"\n💡 Found {total_violations} violation(s).") + print("❌ Please replace 'import re' with 'import regex as re'") + print( + " Also replace 'from re import ...' 
with 'from regex import ...'" + ) # noqa: E501 + print("✅ Allowed imports:") + print(" - import regex as re") + print(" - import regex") # noqa: E501 + return 1 + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/mypy.sh b/tools/mypy.sh new file mode 100755 index 0000000..bf9bc77 --- /dev/null +++ b/tools/mypy.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +CI=${1:-0} +PYTHON_VERSION=${2:-local} + +if [ "$CI" -eq 1 ]; then + set -e +fi + +if [ $PYTHON_VERSION == "local" ]; then + PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +fi + +run_mypy() { + echo "Running mypy on $1" + mypy --check-untyped-defs --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +} + +run_mypy vllm_ascend +run_mypy examples +run_mypy tests diff --git a/tools/png-lint.sh b/tools/png-lint.sh new file mode 100755 index 0000000..3eb7667 --- /dev/null +++ b/tools/png-lint.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +# Ensure that *.excalidraw.png files have the excalidraw metadata +# embedded in them. This ensures they can be loaded back into +# the tool and edited in the future. + +find . -iname '*.excalidraw.png' | while read -r file; do + if git check-ignore -q "$file"; then + continue + fi + if ! grep -q "excalidraw+json" "$file"; then + echo "$file was not exported from excalidraw with 'Embed Scene' enabled." + exit 1 + fi +done diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 0000000..e0833f3 --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +set -e + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +# should enable this +# find . -path ./.git -prune -o -name "*.sh" -print0 \ +# | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 0000000..2bedf37 --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs diff --git a/typos.toml b/typos.toml new file mode 100644 index 0000000..bd75b50 --- /dev/null +++ b/typos.toml @@ -0,0 +1,177 @@ +[files] +# these files may be written in non english words +extend-exclude = [] +ignore-hidden = true +ignore-files = true +ignore-dot = true +ignore-vcs = true +ignore-global = true +ignore-parent = true + +[default] +binary = false +check-filename = false +check-file = true +unicode = true +ignore-hex = true +identifier-leading-digits = false +locale = "en" +extend-ignore-identifiers-re = [".*Unc.*", ".*_thw", + ".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*", + ".*ot.*", ".*[Tt]h[rR].*"] +extend-ignore-words-re = ["CANN", "cann"] +extend-ignore-re = [] + +[default.extend-identifiers] +nd_to_nz_2d = "nd_to_nz_2d" +bbc5b7ede = "bbc5b7ede" +womens_doubles = "womens_doubles" +v_2nd = "v_2nd" +splitted_input = "splitted_input" +NOOPs = "NOOPs" +typ = "typ" +nin_shortcut = "nin_shortcut" +UperNetDecoder = "UperNetDecoder" +subtile = "subtile" +SFOuput = "SFOuput" +# huggingface transformers repo uses these words +depthwise_seperable_out_channel = "depthwise_seperable_out_channel" +DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d" +depthwise_seperable_CNN = "depthwise_seperable_CNN" + +[default.extend-words] +iy = "iy" +tendencias = "tendencias" +# intel cpu features +tme = "tme" +dout = "dout" +Pn = "Pn" +arange = "arange" + +[type.py] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.py.extend-identifiers] +arange = "arange" +NDArray = "NDArray" +EOFError = "EOFError" + +[type.py.extend-words] + +[type.cpp] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.cpp.extend-identifiers] +countr_one = "countr_one" + +[type.cpp.extend-words] + +[type.rust] 
+extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.rust.extend-identifiers] +flate2 = "flate2" + +[type.rust.extend-words] +ser = "ser" + +[type.lock] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.lock.extend-identifiers] + +[type.lock.extend-words] + +[type.jl] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.jl.extend-identifiers] + +[type.jl.extend-words] +modul = "modul" +egals = "egals" +usig = "usig" +egal = "egal" + +[type.go] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.go.extend-identifiers] +flate = "flate" + +[type.go.extend-words] + +[type.css] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.css.extend-identifiers] +nd = "nd" + +[type.css.extend-words] + +[type.man] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.man.extend-identifiers] +Nd = "Nd" + +[type.man.extend-words] + +[type.cert] +extend-glob = [] +check-file = false +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.cert.extend-identifiers] + +[type.cert.extend-words] + +[type.sh] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.sh.extend-identifiers] +stap = "stap" +ot = "ot" + +[type.sh.extend-words] + +[type.vimscript] +extend-glob = [] +extend-ignore-identifiers-re = [] +extend-ignore-words-re = [] +extend-ignore-re = [] + +[type.vimscript.extend-identifiers] +windo = "windo" + +[type.vimscript.extend-words] diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py new file mode 100644 index 0000000..7588e70 --- /dev/null +++ b/vllm_ascend/__init__.py @@ -0,0 +1,27 @@ 
+# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + + +def register(): + """Register the NPU platform.""" + + return "vllm_ascend.platform.NPUPlatform" + + +def register_model(): + from .models import register_model + register_model() diff --git a/vllm_ascend/ascend_config.py b/vllm_ascend/ascend_config.py new file mode 100644 index 0000000..e46cd9a --- /dev/null +++ b/vllm_ascend/ascend_config.py @@ -0,0 +1,215 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import Optional + +from vllm.logger import logger + +TORCHAIR_MODEL_LIST = ["deepseek", "pangu", "kimi_k2", "qwen"] + + +def _check_torchair_supported(model_type: str): + for supported_model in TORCHAIR_MODEL_LIST: + if supported_model in model_type.lower(): + return True + return False + + +class AscendConfig: + """ + Configuration Object for additional_config from vllm.configs. + """ + + def __init__(self, vllm_config): + additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {} + + torchair_graph_config = additional_config.get("torchair_graph_config", + {}) + self.torchair_graph_config = TorchairGraphConfig(torchair_graph_config) + + ascend_scheduler_config = additional_config.get( + "ascend_scheduler_config", {}) + self.ascend_scheduler_config = AscendSchedulerConfig( + ascend_scheduler_config) + + self.expert_map_path = additional_config.get("expert_map_path", None) + self.chunked_prefill_for_mla = additional_config.get( + "chunked_prefill_for_mla", False) + self.enable_shared_expert_dp = additional_config.get( + "enable_shared_expert_dp", False + ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel + self.enable_prefetch = additional_config.get("enable_prefetch", False) + self.lmhead_tensor_parallel_size = additional_config.get( + "lmhead_tensor_parallel_size", None) + if self.lmhead_tensor_parallel_size is not None: + logger.info( + f"Enable lmhead_tensor_parallel_size={self.lmhead_tensor_parallel_size} in pure DP scenario" + ) + if vllm_config.parallel_config.tensor_parallel_size != 1: + raise AssertionError( + "lmhead_tensor_parallel_size is only supported in the pure DP scenario" + ) + + +class TorchairGraphConfig: + """ + Configuration Object for torchair_graph_config from additional_config + """ + + def __init__(self, torchair_graph_config): + self.enabled = torchair_graph_config.get("enabled", False) + self.mode = torchair_graph_config.get("mode", 
'') + self.use_cached_graph = torchair_graph_config.get( + "use_cached_graph", False) + self.use_cached_kv_cache_bytes = torchair_graph_config.get( + "use_cached_kv_cache_bytes", False) + self.graph_batch_sizes = torchair_graph_config.get( + "graph_batch_sizes", []) + self.graph_batch_sizes_init = torchair_graph_config.get( + "graph_batch_sizes_init", False) + self.enable_multistream_mla = torchair_graph_config.get( + "enable_multistream_mla", False) + self.enable_multistream_moe = torchair_graph_config.get( + "enable_multistream_moe", False) + self.enable_view_optimize = torchair_graph_config.get( + "enable_view_optimize", True) + self.enable_kv_nz = torchair_graph_config.get("enable_kv_nz", False) + + if not isinstance(self.graph_batch_sizes, list): + raise TypeError("graph_batch_sizes must be list[int]") + if self.graph_batch_sizes_init and len(self.graph_batch_sizes) > 0: + raise ValueError( + "graph_batch_sizes_init is only valid when graph_batch_sizes is empty" + ) + if not self.enabled: + if self.mode: + raise RuntimeError( + "mode is valid only when Torchair graph mode is enabled") + if self.use_cached_graph: + raise RuntimeError( + "use_cached_graph is valid only when Torchair graph mode is enabled" + ) + if self.use_cached_kv_cache_bytes: + raise RuntimeError( + "use_cached_kv_cache_bytes is valid only when Torchair graph mode is enabled" + ) + if self.graph_batch_sizes: + raise RuntimeError( + "graph_batch_sizes is valid only when Torchair graph mode is enabled" + ) + if self.graph_batch_sizes_init: + raise RuntimeError( + "graph_batch_sizes_init is valid only when Torchair graph mode is enabled" + ) + if self.enable_multistream_mla: + raise RuntimeError( + "enable_multistream_mla is valid only when Torchair graph mode is enabled" + ) + if self.enable_multistream_moe: + raise RuntimeError( + "enable_multistream_moe is valid only when Torchair graph mode is enabled" + ) + if self.enable_kv_nz: + raise RuntimeError( + "enable_kv_nz is valid only when 
Torchair graph mode is enabled" + ) + if self.use_cached_kv_cache_bytes and not self.use_cached_graph: + raise RuntimeError( + "use_cached_kv_cache_bytes is valid only when Torchair graph mode and use_cached_graph are enabled" + ) + + +class AscendSchedulerConfig: + """ + Configuration Object for ascend_scheduler_config from additional_config + """ + + def __init__(self, ascend_scheduler_config: dict): + self.enabled = ascend_scheduler_config.get("enabled", False) + # Ascend scheduler is based on vllm v0 scheduler, so we should support + # all vllm v0 scheduler configs as well. + for k, v in ascend_scheduler_config.items(): + if not hasattr(self, k): + setattr(self, k, v) + + +_ASCEND_CONFIG: Optional[AscendConfig] = None + + +def init_ascend_config(vllm_config): + additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {} + refresh = additional_config.get("refresh", + False) if additional_config else False + global _ASCEND_CONFIG + if _ASCEND_CONFIG is not None and not refresh: + return _ASCEND_CONFIG + _ASCEND_CONFIG = AscendConfig(vllm_config) + return _ASCEND_CONFIG + + +def clear_ascend_config(): + global _ASCEND_CONFIG + _ASCEND_CONFIG = None + + +def get_ascend_config(): + global _ASCEND_CONFIG + if _ASCEND_CONFIG is None: + raise RuntimeError( + "Ascend config is not initialized. Please call init_ascend_config first." + ) + return _ASCEND_CONFIG + + +def check_ascend_config(vllm_config, enforce_eager): + ascend_config = get_ascend_config() + + # for eager mode + if enforce_eager: + # torchair_graph cannot be enabled with eager mode. + if ascend_config.torchair_graph_config.enabled: + raise RuntimeError( + "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode." + ) + # for graph mode + else: + # torchair_graph case + if ascend_config.torchair_graph_config.enabled: + # torchair_graph is supported for deepseek/pangu/qwen model only. 
+ if vllm_config.model_config: + model_type = vllm_config.model_config.hf_config.model_type + if not _check_torchair_supported(model_type): + raise NotImplementedError( + "Torchair graph mode only works with following model types:" + f"{TORCHAIR_MODEL_LIST}.") + if ascend_config.enable_shared_expert_dp: + logger.warning( + "enable_shared_expert_dp is not supported for torchair graph mode currently, " + "it has been disabled automatically.") + # aclgraph case + else: + # aclgraph doesn't work with deepseek model and only qwen model is well tested. + if vllm_config.model_config: + model_type = vllm_config.model_config.hf_config.model_type + if "deepseek" in model_type: + raise NotImplementedError( + "ACL Graph does not support deepseek. Please " + "try torchair graph mode to serve deepseek models on vllm-ascend." + " Or set `enforce_eager=True` to use eager mode.") + if "qwen" not in model_type: + logger.warning( + "ACL Graph is currently experimental. Please " + "raise an issue on https://github.com/vllm-project/vllm-ascend/issues" + " if you encourage any Error") diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py new file mode 100644 index 0000000..601f33a --- /dev/null +++ b/vllm_ascend/ascend_forward_context.py @@ -0,0 +1,138 @@ +import math +from contextlib import contextmanager +from enum import Enum +from typing import Any, Optional + +import torch +from vllm.config import CUDAGraphMode, VllmConfig +from vllm.distributed import (get_dp_group, get_ep_group, + get_tensor_model_parallel_world_size) +from vllm.forward_context import (BatchDescriptor, get_forward_context, + set_forward_context) + +import vllm_ascend.envs as envs_ascend + + +class FusedMoEState(Enum): + AllGather = 0 + All2All = 1 + MC2 = 2 + AllGatherEP = 3 + NaiveMulticast = 4 + All2AllSeq = 5 + + +# TODO(zzzzwwjj): add soc_version to choose branch +def _get_fused_moe_state(ep_size: int, with_prefill: bool, + is_deepseek_v3_r1: bool): + # the fusion operator 
def _get_fused_moe_state(ep_size: int, with_prefill: bool,
                         is_deepseek_v3_r1: bool):
    """Select the fused-MoE communication strategy for the current step.

    NOTE(review): the ``def`` line of this function lies before this chunk;
    the signature is reconstructed from the single call site in
    ``set_ascend_forward_context`` below -- confirm against the full file.
    """
    # torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
    # only supports deepseek v3/r1
    if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
            and is_deepseek_v3_r1):
        return FusedMoEState.AllGatherEP
    elif ep_size == 1:
        # Single expert-parallel rank: plain collectives are enough.
        if with_prefill:
            return FusedMoEState.NaiveMulticast
        else:
            return FusedMoEState.AllGather
    # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
    elif ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
    else:
        return FusedMoEState.MC2


def get_dispatcher_name(ep_size: int, with_prefill: bool) -> str:
    """Map (ep_size, with_prefill) to the token-dispatcher class name.

    Mirrors the thresholds of ``_get_fused_moe_state``: MC2 is only used
    for decode-only steps with ep_size >= 16.
    """
    if ep_size == 1:
        return "TokenDispatcherWithAllGather"
    # ep_size in (1, 16), or any step containing prefill, uses All2AllV.
    if ep_size < 16 or with_prefill:
        return "TokenDispatcherWithAll2AllV"
    return "TokenDispatcherWithMC2"


@contextmanager
def set_ascend_forward_context(
        attn_metadata: Any,
        vllm_config: VllmConfig,
        virtual_engine: int = 0,
        num_tokens: Optional[int] = None,
        num_tokens_across_dp: Optional[torch.Tensor] = None,
        with_prefill: bool = True,
        in_profile_run: bool = False,
        reserved_mc2_mask: Optional[torch.Tensor] = None,
        moe_comm_method: str = "",
        num_actual_tokens: Optional[int] = None,
        aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
        batch_descriptor: Optional[BatchDescriptor] = None):
    """A context manager that stores the current forward context,
    can be attention metadata, etc.
    We add some additional param into forward_context.
    """
    with set_forward_context(
            attn_metadata,
            vllm_config,
            virtual_engine=virtual_engine,
            num_tokens=num_tokens,
            num_tokens_across_dp=num_tokens_across_dp,
            cudagraph_runtime_mode=aclgraph_runtime_mode,
            batch_descriptor=batch_descriptor,
    ):
        forward_context = get_forward_context()
        forward_context.moe_comm_method_name = moe_comm_method + "commimpl"
        forward_context.with_prefill = with_prefill
        ep_size = (get_ep_group().world_size if
                   vllm_config.parallel_config.enable_expert_parallel else 1)

        # DeepSeek v3/r1 is identified purely by its routed-expert count.
        is_deepseek_v3_r1 = hasattr(
            vllm_config.model_config.hf_config, 'n_routed_experts'
        ) and vllm_config.model_config.hf_config.n_routed_experts == 256
        fused_moe_state = _get_fused_moe_state(ep_size, with_prefill,
                                               is_deepseek_v3_r1)
        forward_context.fused_moe_state = fused_moe_state
        forward_context.in_profile_run = in_profile_run

        # Imported lazily to avoid a circular import at module load time.
        from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
            get_token_dispatcher
        dispatcher_name = get_dispatcher_name(ep_size, with_prefill)
        dispatcher = get_token_dispatcher(dispatcher_name)
        forward_context.token_dispatcher = dispatcher

        # NOTE: This cannot be set using set_forward_context
        # due to multiple warmups before actual capturing
        forward_context.capturing = False

        if num_tokens is None and attn_metadata is not None:
            num_tokens = attn_metadata.num_actual_tokens

        dp_world_size = get_dp_group().world_size
        if dp_world_size > 1 and forward_context.dp_metadata is not None:
            max_tokens_across_dp = \
                forward_context.dp_metadata.max_tokens_across_dp_cpu.item()
        else:
            max_tokens_across_dp = num_tokens

        forward_context.max_tokens_across_dp = max_tokens_across_dp

        if num_tokens is not None:
            if num_actual_tokens is None:
                num_actual_tokens = num_tokens
            tp_world_size = get_tensor_model_parallel_world_size()
            # NOTE: token num which need to pad to when mc2
            forward_context.padded_num_tokens = math.ceil(
                max_tokens_across_dp / tp_world_size) * tp_world_size

            if reserved_mc2_mask is not None:
                # Active-token mask, padded up to the mc2 padding size;
                # mutates the caller-provided reserved buffer in place.
                mc2_mask = reserved_mc2_mask[:forward_context.
                                             padded_num_tokens]
                mc2_mask[:num_actual_tokens] = True
                mc2_mask[num_actual_tokens:] = False
                forward_context.mc2_mask = mc2_mask

        try:
            yield
        finally:
            pass


# ---------------------------------------------------------------------------
# NOTE(review): in the original (mangled) diff a new file starts here:
# vllm_ascend/attention/attention_mask.py
# ---------------------------------------------------------------------------
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def _generate_attn_mask(max_seq_len, dtype):
    """Build a (max_seq_len, max_seq_len) causal mask in ``dtype``.

    Entries strictly above the diagonal carry the mask value
    (-inf for fp16, 1 otherwise); the lower triangle is 0.
    """
    # Construct lower triangle matrix.
    mask_flag = torch.tril(
        torch.ones((max_seq_len, max_seq_len),
                   dtype=torch.bool)).view(max_seq_len, max_seq_len)
    # Create upper triangle matrix used to mark mask positions.
    mask_flag = ~mask_flag
    # Currently for fp16 dtype, the mask value should be set to -inf.
    # TODO: Eliminate this part in the future.
    if dtype == torch.float16:
        # float32 min overflows to -inf when cast to fp16 below.
        mask_value = torch.finfo(torch.float32).min
    else:
        mask_value = 1
    attn_mask = torch.masked_fill(torch.zeros(size=(max_seq_len, max_seq_len)),
                                  mask_flag, mask_value).to(dtype)
    return attn_mask
def _generate_attn_mask(max_seq_len, dtype):
    """Build a (max_seq_len, max_seq_len) causal mask in ``dtype``.

    Entries strictly above the diagonal carry the mask value
    (-inf for fp16, 1 otherwise); the lower triangle is 0.
    """
    # Construct lower triangle matrix.
    mask_flag = torch.tril(
        torch.ones((max_seq_len, max_seq_len),
                   dtype=torch.bool)).view(max_seq_len, max_seq_len)
    # Create upper triangle matrix used to mark mask positions.
    mask_flag = ~mask_flag
    # Currently for fp16 dtype, the mask value should be set to -inf.
    # TODO: Eliminate this part in the future.
    if dtype == torch.float16:
        # float32 min overflows to -inf when cast to fp16 below.
        mask_value = torch.finfo(torch.float32).min
    else:
        mask_value = 1
    attn_mask = torch.masked_fill(torch.zeros(size=(max_seq_len, max_seq_len)),
                                  mask_flag, mask_value).to(dtype)
    return attn_mask


class AttentionMaskBuilder:
    """Caches a causal attention mask and serves (possibly grown) views.

    The cache is regenerated whenever a request exceeds the cached
    sequence length; it is re-cast in place on dtype changes.
    """

    def __init__(
        self,
        max_seq_len: int,
        dtype: torch.dtype,
    ):
        attn_mask = _generate_attn_mask(max_seq_len, dtype)

        # Side length of the cached square mask.
        self._seq_len_cached = attn_mask.shape[0]
        # The cached mask tensor itself.
        self.attn_mask_cache = attn_mask

    @staticmethod
    def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
        """Return the factor the split-fuse mask is multiplied by.

        fp16 masks already hold -inf, so the factor is 1; bf16 masks hold 1
        and are scaled to -10000. Raises ValueError for any other dtype.
        """
        if dtype == torch.float16:
            mask_scale_factor = 1
        elif dtype == torch.bfloat16:
            mask_scale_factor = -10000
        else:
            raise ValueError(
                "The current operation now only supports data types: torch.float16 and "
                "torch.bfloat16. Please ensure the input is of one of these types."
            )
        return mask_scale_factor

    def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
                      device: torch.device):
        """Return a contiguous (max_seq_len, max_seq_len) causal mask."""
        self._update_attn_cache(max_seq_len, dtype)
        return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
        ).to(device)

    def get_splitfuse_attn_mask(
        self,
        seq_lens: torch.Tensor,
        position: torch.Tensor,
        dtype: torch.dtype,
        device: torch.device,
    ) -> torch.Tensor:
        """Gather per-token mask rows (selected by ``position``) for
        chunked-prefill, scaled by the dtype-specific mask factor."""
        if dtype not in [torch.float16, torch.bfloat16]:
            raise ValueError(
                "splitfuse_attn_mask now only supports bf16 and fp16")
        # NOTE(review): when seq_lens is a tensor this stores a 0-dim tensor
        # into self._seq_len_cached via _update_attn_cache; later comparisons
        # still work, but the cached type drifts -- worth confirming upstream.
        max_seq_len = max(seq_lens, default=0)
        self._update_attn_cache(max_seq_len, dtype)
        # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
        # is not the same. Fix this in the future when kernel is ready.
        mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(dtype)
        attn_mask = torch.index_select(self.attn_mask_cache,
                                       dim=0,
                                       index=position)[:, :max_seq_len]
        attn_mask *= mask_scale_factor
        return attn_mask.contiguous().to(device, non_blocking=True)

    def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
        """Grow the cached mask to ``seqlen`` and/or re-cast it to ``dtype``."""
        if seqlen > self._seq_len_cached:
            self._seq_len_cached = seqlen
            self.attn_mask_cache = _generate_attn_mask(seqlen, dtype)
        if self.attn_mask_cache.dtype != dtype:
            self.attn_mask_cache = self.attn_mask_cache.to(dtype)


# ---------------------------------------------------------------------------
# NOTE(review): in the original (mangled) diff a new file starts here:
# vllm_ascend/attention/attention_v1.py
# ---------------------------------------------------------------------------
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
from dataclasses import dataclass
from enum import Enum
from typing import List, Optional, Tuple, Type

import torch
import torch.nn as nn
import torch_npu
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionLayer, AttentionType)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.config import VllmConfig
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils import cdiv, direct_register_custom_op
from vllm.v1.core.sched.output import SchedulerOutput

from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
from vllm_ascend.ops.attention import vanilla_chunked_prefill
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
                               nd_to_nz_2d, nd_to_nz_spec)
from vllm_ascend.worker.npu_input_batch import InputBatch


class AscendAttentionBackend(AttentionBackend):
    """vLLM attention-backend registration for Ascend NPUs."""

    accept_output_buffer: bool = True

    @staticmethod
    def get_name() -> str:
        return "ASCEND"

    @staticmethod
    def get_impl_cls() -> Type["AscendAttentionBackendImpl"]:
        return AscendAttentionBackendImpl

    @staticmethod
    def get_metadata_cls() -> Type["AscendMetadata"]:
        return AscendMetadata

    @staticmethod
    def get_state_cls() -> Type["CommonAttentionState"]:
        return CommonAttentionState

    @staticmethod
    def get_builder_cls() -> type["AscendAttentionMetadataBuilder"]:
        return AscendAttentionMetadataBuilder

    @staticmethod
    def get_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        # 310P uses the NZ fractal layout (last dim fixed to 16).
        if is_310p():
            return (2, num_blocks, num_kv_heads * head_size // 16, block_size,
                    16)
        return (2, num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def get_bsh_kv_cache_shape(
        num_blocks: int,
        block_size: int,
        num_kv_heads: int,
        head_size: int,
    ) -> Tuple[int, ...]:
        return (2, num_blocks, block_size, num_kv_heads * head_size)

    @staticmethod
    def swap_blocks(
        src_kv_cache: List[torch.Tensor],
        dst_kv_cache: List[torch.Tensor],
        src_to_dst: torch.Tensor,
    ) -> None:
        """Copy (src, dst) index pairs between two KV caches (cross-device)."""
        src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1]
        dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1]
        src_indices = src_to_dst[:, 0]
        dst_indices = src_to_dst[:, 1]

        dst_key_cache[dst_indices] = src_key_cache[src_indices].to(
            dst_key_cache.device)
        dst_value_cache[dst_indices] = src_value_cache[src_indices].to(
            dst_key_cache.device)

    @staticmethod
    def copy_blocks(
        kv_caches: List[torch.Tensor],
        src_to_dists: torch.Tensor,
    ) -> None:
        """Copy (src, dst) block pairs within every KV cache in the list."""
        src_indices = src_to_dists[:, 0]
        dst_indices = src_to_dists[:, 1]

        for kv_cache in kv_caches:
            key_caches = kv_cache[0]
            value_caches = kv_cache[1]
            key_caches[dst_indices] = key_caches[src_indices]
            value_caches[dst_indices] = value_caches[src_indices]


class AscendAttentionState(Enum):
    PrefillNoCache = 0
    PrefillCacheHit = 1
    DecodeOnly = 2
    ChunkedPrefill = 3
    SpecDecoding = 4


@dataclass
class AscendMetadata:
    """Per-step attention metadata consumed by AscendAttentionBackendImpl."""

    # **************************** Basic Properties ************************** #
    attn_mask: Optional[torch.Tensor] = None
    # Current state of this attention run.
    attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill

    # Number of tokens excluding padding.
    num_actual_tokens: int = 0

    # The sequence length per sequence. Sequence length means the computed
    # tokens + new tokens (is None if it is a decoding).
    # (batch_size,)
    seq_lens: Optional[torch.Tensor] = None

    query_start_loc: Optional[torch.Tensor] = None
    query_lens: Optional[torch.Tensor] = None
    # Maximum query length in the batch (None for decoding).
    max_query_len: Optional[int] = None

    # ********************** KV Cache Related Properties ********************* #
    # Block addresses per sequence (Seq id -> list of physical block).
    # (batch_size, max_blocks_per_seq)
    block_tables: Optional[torch.Tensor] = None

    # The indices of the token slots that input tokens will be stored into.
    # E.g., if `slot_mapping` is [35, 2, 17] and the block size is 16, the
    # three tokens are stored in the 3rd slot in block 2, 2nd slot in block 0,
    # and 1st slot in block 1, respectively.
    # (num_tokens,)
    slot_mapping: Optional[torch.Tensor] = None

    # *************************** Other Properties *************************** #
    enable_dbo_across_dp: bool = False
    is_only_prefill: bool = False


class AscendAttentionMetadataBuilder:
    """Builds AscendMetadata for a scheduled batch."""

    def __init__(
        self,
        vllm_config: VllmConfig,
        device: torch.device,
    ):
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.device = device
        # Upper bound of blocks any single request can occupy.
        self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len,
                                           vllm_config.cache_config.block_size)

    def reorder_batch(self, input_batch: "InputBatch",
                      scheduler_output: "SchedulerOutput") -> bool:
        # This backend never reorders the persistent batch.
        return False

    def build(
        self,
        common_attn_metadata: AscendCommonAttentionMetadata,
        model: nn.Module,
    ):
        num_reqs = common_attn_metadata.num_reqs
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[:
                                                                       num_reqs
                                                                       + 1]

        block_table = common_attn_metadata.block_table_tensor
        # NOTE(review): this writes the first num_reqs rows back into a
        # column-limited view of themselves -- effectively a no-op when the
        # table width equals max_num_blocks_per_req. Kept as-is; confirm
        # intent against the original file.
        block_table[:num_reqs, :self.max_num_blocks_per_req] = (
            block_table[:num_reqs])

        query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
        seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs]
        slot_mapping = common_attn_metadata.slot_mapping_cpu[:
                                                             num_actual_tokens].to(
                                                                 self.device,
                                                                 non_blocking=
                                                                 True)
        attn_mask = common_attn_metadata.attn_mask
        attn_state = common_attn_metadata.attn_state
        # (The original recomputed query_start_loc_cpu here with an identical
        # expression; the duplicate was removed.)
        query_start_loc = query_start_loc_cpu.to(self.device,
                                                 non_blocking=True)

        if is_310p():
            # 310P kernels require the mask in the NZ fractal format.
            if attn_state == AscendAttentionState.PrefillNoCache:
                mask_nz = nd_to_nz_2d(attn_mask)
                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
                                                      ACL_FORMAT_FRACTAL_NZ)
            elif attn_state == AscendAttentionState.ChunkedPrefill:
                mask_nz = nd_to_nz_spec(attn_mask)
                attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(),
                                                      ACL_FORMAT_FRACTAL_NZ)

        attn_metadata = AscendMetadata(
            num_actual_tokens=num_actual_tokens,
            block_tables=block_table,
            query_start_loc=query_start_loc,
            query_lens=query_lens,
            seq_lens=seq_lens,
            max_query_len=common_attn_metadata.max_query_len,
            slot_mapping=slot_mapping,
            attn_mask=attn_mask,
            attn_state=attn_state,
            enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp,
            is_only_prefill=common_attn_metadata.is_only_prefill)
        return attn_metadata
class AscendAttentionBackendImpl(AttentionImpl):
    """Ascend NPU attention implementation (prefill + paged decode)."""

    def __init__(
        self,
        num_heads: int,
        head_size: int,
        scale: float,
        num_kv_heads: int,
        alibi_slopes: Optional[List[float]],
        sliding_window: Optional[int],
        kv_cache_dtype: str,
        logits_soft_cap: Optional[float],
        attn_type: str,
        kv_sharing_target_layer_name: Optional[str],
        **kwargs,
    ) -> None:
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.hidden_size = self.num_heads * self.head_size
        self.kv_cache_dtype = kv_cache_dtype
        self.sliding_window = sliding_window
        if alibi_slopes is not None:
            alibi_slopes = torch.tensor(alibi_slopes,
                                        dtype=torch.float32,
                                        device="npu")
        self.alibi_slopes = alibi_slopes
        self.attn_type = attn_type

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        # Bound lazily on the first forward() that sees a kv_cache.
        self.key_cache = None
        self.value_cache = None

    def _repeat_kv(self, hidden_states: torch.Tensor,
                   n_rep: int) -> torch.Tensor:
        """
        This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
        The hidden states go from (batch,
        num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
        """
        num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, None, :, :].expand(
            num_key_value_heads, n_rep, slen, head_dim)
        return hidden_states.reshape(num_key_value_heads * n_rep, slen,
                                     head_dim)

    def _forward_prefill_no_cache(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        attn_metadata: AscendMetadata,
        output: Optional[torch.Tensor] = None,
        num_tokens=0,
    ) -> torch.Tensor:
        """Full prefill attention with no KV-cache hit."""
        assert attn_metadata is not None
        assert attn_metadata.attn_mask is not None

        mask = attn_metadata.attn_mask

        if is_310p():
            # align q k v output tensors
            query = aligned_16(query)
            key = aligned_16(key)
            value = aligned_16(value)
            output = aligned_16(output)
            # do reformat in case of broadcasted tensors
            mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1)
            mask = torch_npu.npu_format_cast(mask.contiguous(),
                                             ACL_FORMAT_FRACTAL_NZ)

        # Sliding-window path only kicks in once the mask outgrows the window.
        if self.sliding_window is not None and \
                attn_metadata.attn_mask.shape[0] > self.sliding_window:

            key = self._repeat_kv(key, self.num_heads // self.num_kv_heads)
            value = self._repeat_kv(value, self.num_heads // self.num_kv_heads)

            output, _ = torch_npu.npu_fused_infer_attention_score(
                query,
                key,
                value,
                num_heads=self.num_heads,
                num_key_value_heads=self.num_kv_heads,
                input_layout="TND",
                pre_tokens=self.sliding_window,
                scale=self.scale,
                actual_seq_lengths=attn_metadata.seq_lens,
                actual_seq_lengths_kv=attn_metadata.seq_lens)
            output = output.view(num_tokens, self.num_heads, self.head_size)
        else:
            torch_npu._npu_flash_attention(query=query,
                                           key=key,
                                           value=value,
                                           mask=mask,
                                           seq_len=attn_metadata.seq_lens,
                                           scale_value=self.scale,
                                           num_heads=self.num_heads,
                                           num_kv_heads=self.num_kv_heads,
                                           out=output)
        assert output is not None
        # Trim possible 310P alignment padding.
        return output[:num_tokens, :, :]

    def _forward_prefill_cache_hit(
        self,
        query: torch.Tensor,
        attn_metadata: AscendMetadata,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Prefill where the context is already in the KV cache."""
        assert attn_metadata is not None
        assert attn_metadata.attn_mask is not None

        compress_mask = attn_metadata.attn_mask
        batch_size = attn_metadata.query_lens.shape[0]
        block_table = attn_metadata.block_tables[:batch_size, :]

        torch_npu._npu_flash_attention_qlens(
            query=query,
            key_cache=self.key_cache,
            value_cache=self.value_cache,
            block_table=block_table,
            mask=compress_mask,
            seq_len=attn_metadata.query_lens,
            context_lens=attn_metadata.seq_lens,
            num_kv_heads=self.num_kv_heads,
            num_heads=self.num_heads,
            scale_value=self.scale,
            out=output)
        return output

    def _forward_decode_only(
        self,
        query: torch.Tensor,
        attn_metadata: AscendMetadata,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Paged-attention decode (one new token per sequence)."""
        if is_310p():
            # seq_lens_tensor needs to be transferred to the device for 310P.
            attn_metadata.seq_lens = \
                attn_metadata.seq_lens.to(device=query.device)
        if self.sliding_window is not None:
            batch_size = attn_metadata.seq_lens.shape[0]
            block_size = 128
            query = query.view(batch_size, 1, self.num_heads * self.head_size)
            key = self.key_cache
            value = self.value_cache
            if self.key_cache is not None and self.value_cache is not None:
                block_size = self.key_cache.shape[1]
                key = self.key_cache.flatten(2, 3).contiguous()
                value = self.value_cache.flatten(2, 3).contiguous()

            output, _ = torch_npu.npu_fused_infer_attention_score(
                query,
                key,
                value,
                num_heads=self.num_heads,
                num_key_value_heads=self.num_kv_heads,
                input_layout="BSH",
                block_size=block_size,
                pre_tokens=self.sliding_window,
                scale=self.scale,
                block_table=attn_metadata.block_tables,
                actual_seq_lengths=[1] * len(attn_metadata.seq_lens),
                actual_seq_lengths_kv=attn_metadata.seq_lens)

            output = output.view(batch_size, self.num_heads, self.head_size)
        else:
            torch_npu._npu_paged_attention(
                query=query,
                key_cache=self.key_cache,
                value_cache=self.value_cache,
                num_kv_heads=self.num_kv_heads,
                num_heads=self.num_heads,
                scale_value=self.scale,
                block_table=attn_metadata.block_tables,
                context_lens=attn_metadata.seq_lens,
                out=output)
        return output

    def _forward_v1_style(
        self,
        query: torch.Tensor,
        attn_metadata: AscendMetadata,
        output: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Chunked-prefill / mixed-batch attention (normal V1 path)."""
        # Use chunked prefill for head size 192 scenario, like deepseek
        # paged_attention_splitfuse maybe crash at such scenario.
        # TODO: vanilla path will be removed after the kernel support
        # head_size 192 scenario.
        if self.head_size == 192:
            cu_seqlen_q = [0] + attn_metadata.query_lens.tolist()
            cu_seqlen_k = [0] + attn_metadata.seq_lens.tolist()
            cu_seqlen_q = torch.tensor(cu_seqlen_q, device=query.device)
            cu_seqlen_k = torch.tensor(cu_seqlen_k, device=query.device)
            cu_seqlen_q = torch.cumsum(cu_seqlen_q, dim=0)
            cu_seqlen_k = torch.cumsum(cu_seqlen_k, dim=0)
            max_seqlen_q = torch.max(attn_metadata.query_lens)
            max_seqlen_k = torch.max(attn_metadata.seq_lens)
            vanilla_chunked_prefill(output, query, self.key_cache,
                                    self.value_cache,
                                    attn_metadata.block_tables, cu_seqlen_q,
                                    cu_seqlen_k, max_seqlen_q, max_seqlen_k,
                                    self.scale, None, True)
            return output

        # Use paged attention.
        assert attn_metadata is not None
        assert attn_metadata.attn_mask is not None

        if is_310p():
            # Do reformat in case of broadcasted tensors.
            attn_metadata.attn_mask = \
                torch_npu.npu_format_cast(attn_metadata.attn_mask.contiguous(),
                                          ACL_FORMAT_FRACTAL_NZ)
            attn_metadata.seq_lens = \
                attn_metadata.seq_lens.to(device=query.device)

        torch_npu._npu_paged_attention_splitfuse(
            query=query,
            key_cache=self.key_cache,
            value_cache=self.value_cache,
            mask=attn_metadata.attn_mask,
            block_table=attn_metadata.block_tables,
            seq_len=attn_metadata.query_lens,
            context_lens=attn_metadata.seq_lens,
            num_kv_heads=self.num_kv_heads,
            num_heads=self.num_heads,
            scale_value=self.scale,
            out=output)
        return output

    def forward(
        self,
        layer: AttentionLayer,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: Tuple[torch.Tensor],
        attn_metadata: AscendMetadata,
        output: Optional[torch.Tensor] = None,
        trace_flag: bool = True,
    ) -> torch.Tensor:
        """Forward pass with Ascend attention.
        Args:
            query: shape = [batch_size, seq_len, num_heads * head_size]
            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
            kv_cache: shape = [key_cache, value_cache]
                      key_cache = [num_blocks, block_size,
                                   num_kv_heads, head_size]
                      value_cache = [num_blocks, block_size,
                                     num_kv_heads, head_size]
            attn_metadata: Metadata for attention.
        Returns:
            shape = [batch_size * seq_len, num_heads, head_size]
        """
        num_tokens = query.shape[0]
        use_kv_cache_int8 = len(
            kv_cache) > 0 and kv_cache[0].dtype == torch.int8
        if output is None:
            output = torch.empty(num_tokens,
                                 self.num_heads,
                                 self.head_size,
                                 dtype=query.dtype,
                                 device=query.device)
        ori_output = output
        if trace_flag:
            # Route through the registered custom op so the call is traceable.
            torch.ops.vllm.unified_ascend_attention_with_output(
                query=query,
                key=key,
                value=value,
                output=output,
                layer_name=layer.layer_name)

        elif hasattr(layer, 'quant_method') and use_kv_cache_int8:
            output = layer.quant_method.apply(layer, query, key, value,
                                              kv_cache, attn_metadata,
                                              self.attn_type, self.scale,
                                              output)

        else:
            if attn_metadata is None:
                return output.view(num_tokens, self.hidden_size)
            num_actual_tokens = attn_metadata.num_actual_tokens
            assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0
            attn_type = self.attn_type
            if attn_type != AttentionType.DECODER:
                # (fixed: the original message named the unrelated
                # PallasAttentionBackendImpl, copied from the TPU backend)
                raise NotImplementedError("Encoder self-attention and "
                                          "encoder/decoder cross-attention "
                                          "are not implemented for "
                                          "AscendAttentionBackendImpl")
            # View q k v to BSH.
            query = query.view(-1, self.num_heads, self.head_size)
            key = key.view(-1, self.num_kv_heads, self.head_size)
            value = value.view(-1, self.num_kv_heads, self.head_size)
            # TODO: Remove this contiguous in the future.
            value = value.contiguous()

            if len(kv_cache) > 1:
                if self.key_cache is None:
                    self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
                slots = attn_metadata.slot_mapping
                torch_npu._npu_reshape_and_cache(
                    key=key[:num_actual_tokens],
                    value=value[:num_actual_tokens],
                    key_cache=self.key_cache,
                    value_cache=self.value_cache,
                    slot_indices=slots)

            # V0-Style scheduler situation.
            if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
                output = self._forward_prefill_no_cache(
                    query, key, value, attn_metadata, output, num_tokens)
            elif attn_metadata.attn_state == \
                    AscendAttentionState.PrefillCacheHit:
                output = self._forward_prefill_cache_hit(
                    query, attn_metadata, output)
            elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
                output = self._forward_decode_only(query, attn_metadata,
                                                   output)
            # Normal V1 situation.
            else:
                output = self._forward_v1_style(query, attn_metadata, output)

        # to make in-place change to the output tensor
        if hasattr(layer, 'quant_method') and use_kv_cache_int8:
            output = output.view(num_tokens, self.num_heads, self.head_size)
            ori_output[:, :, :] = output[:num_tokens, :, :]
        return output.view(num_tokens, self.hidden_size)


def unified_ascend_attention_with_output(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
) -> None:
    """Custom-op entry point: dispatch to the named layer's impl,
    writing the result into `output` in place."""
    forward_context: ForwardContext = get_forward_context()
    attn_metadata = forward_context.attn_metadata
    self = forward_context.no_compile_layers[layer_name]
    kv_cache = self.kv_cache[forward_context.virtual_engine]
    self.impl.forward(self,
                      query,
                      key,
                      value,
                      kv_cache,
                      attn_metadata,
                      output,
                      trace_flag=False)
    return


def unified_attention_with_output_fake(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    output: torch.Tensor,
    layer_name: str,
) -> None:
    # Fake (meta) implementation for tracing: output shape is unchanged.
    return


direct_register_custom_op(
    op_name="unified_ascend_attention_with_output",
    op_func=unified_ascend_attention_with_output,
    mutates_args=["output"],
    fake_impl=unified_attention_with_output_fake,
    dispatch_key="PrivateUse1",
)
# ---------------------------------------------------------------------------
# NOTE(review): in the original (mangled) diff a new file starts here:
# vllm_ascend/attention/mla_v1.py
# ---------------------------------------------------------------------------
from dataclasses import dataclass
from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar

import torch
import torch_npu
from torch import nn
from vllm.attention.backends.abstract import (AttentionBackend,
                                              AttentionMetadata,
                                              MLAAttentionImpl)
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
from vllm.utils import cdiv, round_down

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                         split_decodes_and_prefills)
from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig
from vllm_ascend.multistream.context import get_multistream_comm_context
from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn
from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla
from vllm_ascend.utils import npu_prefetch
from vllm_ascend.worker.npu_input_batch import InputBatch

if TYPE_CHECKING:
    from vllm.v1.core.sched.output import SchedulerOutput


class AscendMLABackend(AttentionBackend):
    """vLLM attention-backend registration for Ascend MLA (DeepSeek-style)."""

    accept_output_buffer: bool = True

    @staticmethod
    def get_name() -> str:
        return "ASCEND_MLA"

    @staticmethod
    def get_metadata_cls() -> type["AttentionMetadata"]:
        return AscendMLAMetadata

    @staticmethod
    def get_builder_cls():
        return AscendMLAMetadataBuilder

    @staticmethod
    def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int,
                           head_size: int) -> tuple[int, ...]:
        return (num_blocks, block_size, num_kv_heads, head_size)

    @staticmethod
    def get_impl_cls() -> Type["MLAAttentionImpl"]:
        return AscendMLAImpl


@dataclass
class AscendMLAPrefillMetadata:
    """ Prefill Specific Metadata for Ascend"""

    @dataclass
    class ChunkedContextMetadata:
        # New for MLA (compared to FlashAttention)
        # For handling chunked prefill
        cu_seq_lens: torch.Tensor
        starts: torch.Tensor
        seq_tot: list[int]
        max_seq_lens: list[int]
        workspace: torch.Tensor
        chunk_seq_lens: torch.Tensor

    attn_mask: torch.Tensor
    query_lens: list[int]
    seq_lens: list[int]
    context_lens: torch.Tensor
    input_positions: torch.Tensor
    query_start_loc: torch.Tensor
    block_table: torch.Tensor
    max_query_len: int
    max_seq_lens: int
    chunked_context: Optional[ChunkedContextMetadata] = None
    # (annotations corrected to Optional: these default to None)
    sin: Optional[torch.Tensor] = None
    cos: Optional[torch.Tensor] = None


@dataclass
class AscendMLADecodeMetadata:
    # Input positions for rotrary embeddings since for MLA the rotary
    # position embeddings are applied inside the attention backend
    input_positions: torch.Tensor
    block_table: torch.Tensor
    seq_lens: torch.Tensor
    max_seq_lens: int
    seq_lens_list: list[int]
    actual_seq_lengths_q: Optional[list[int]] = None
    attn_mask: Optional[torch.Tensor] = None
    # (annotations corrected to Optional: these default to None)
    sin: Optional[torch.Tensor] = None
    cos: Optional[torch.Tensor] = None


@dataclass
class AscendMLAMetadata:
    """Metadata for MLACommon.

    NOTE: Please read the comment at the top of the file before trying to
    understand this class
    """
    # NOTE(sang): Definition of context_len, query_len, and seq_len.
    # |---------- N-1 iteration --------|
    # |---------------- N iteration ---------------------|
    # |- tokenA -|......................|-- newTokens ---|
    # |---------- context_len ----------|
    # |-------------------- seq_len ---------------------|
    #                                   |-- query_len ---|

    num_actual_tokens: int  # Number of tokens excluding padding.
    slot_mapping: torch.Tensor
    query_start_loc: torch.Tensor
    seq_lens: torch.Tensor
    block_tables: torch.Tensor

    # New for MLA (compared to FlashAttention)
    # For handling prefill decode split
    num_decodes: int
    num_decode_tokens: int
    num_prefills: int

    # For logging.
    num_input_tokens: int = 0  # Number of tokens including padding.

    query_lens: Optional[list[int]] = None
    # The dimension of the attention heads
    head_dim: Optional[int] = None
    attn_mask: Optional[torch.Tensor] = None
    # chunked prefill by default if no attn_states passed
    attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill

    decode: Optional[AscendMLADecodeMetadata] = None
    prefill: Optional[AscendMLAPrefillMetadata] = None
    enable_dbo_across_dp: bool = False

    def __post_init__(self):
        pass
        # supported_head_sizes = AscendMLABackend.get_supported_head_sizes()
        # if self.head_dim is not None and self.head_dim \
        #     not in supported_head_sizes:
        #     raise ValueError(
        #         f"Only {supported_head_sizes} are supported for head_dim,",
        #         f"received {self.head_dim}.")

    def split_metadata_for_multistream(
        self,
        ms_split_config: MSAttentionMetadataSplitConfig,
    ) -> list["AscendMLAMetadata"]:
        """Split metadata for multi-stream with AscendMLAMetadata"""
        return model_input_split_v1_mla_attn(
            ms_split_config=ms_split_config,
            attn_metadata=self,
            _metadata_cls=AscendMLAMetadata,
        )


M = TypeVar("M", bound=AscendMLAMetadata)


class AscendMLAMetadataBuilder:
    """
    NOTE: Please read the comment at the top of the file before trying to
    understand this class
    """

    # _attn_mask_builder = None
    def __init__(self,
                 vllm_config: VllmConfig,
                 device: torch.device,
                 metadata_cls: Optional[AscendMLAMetadata] = None):
        self.metadata_cls: Optional[AscendMLAMetadata] = metadata_cls \
            if metadata_cls is not None else AscendMLAMetadata  # type: ignore
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.device = device
        scheduler_config = vllm_config.scheduler_config
        self.block_size = vllm_config.cache_config.block_size
        self.max_blocks = (vllm_config.model_config.max_model_len +
                           self.block_size - 1) // self.block_size
        self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled

        # Requests at or below this token count are treated as decodes.
        self.decode_threshold = 1

        if self.chunked_prefill_enabled:
            self.chunked_prefill_workspace_size = min(
                # Max sure there is enough for 8 full length request or at least
                # 4 pages of cache per request
                max(8 * self.model_config.max_model_len,
                    4 * scheduler_config.max_num_seqs * self.block_size),
                # For long-context models try not to over-allocate limiting
                # kv-cache space, limiting it to 64k tokens,
                # which would result in the workspace being:
                # 2*(576)*(64*1024) = 144mb
                # (assuming 576 MLA head dim, and fp16)
                # which would result in up-projected context being
                # 2*(192*128)*(64*1024) = 3gb
                # (assuming 192 QK head dim, 128 heads, and fp16)
                128 * 1024)
            assert self.chunked_prefill_workspace_size >= \
                scheduler_config.max_num_seqs * self.block_size
            self.chunked_prefill_workspace = torch.empty(
                (self.chunked_prefill_workspace_size,
                 self.model_config.get_head_size()),
                dtype=self.model_config.dtype,
                device=device,
            )
        self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
        # Rotary caches are captured lazily from the model on first build().
        self.cos_cache = None
        self.sin_cache = None

    def reorder_batch(self, input_batch: "InputBatch",
                      scheduler_output: "SchedulerOutput") -> bool:
        # We now want to reorder the batch so that the "decode" requests are at
        # the front and the "prefill" requests are at the using the least amount
        # swaps possible. (NOTE for now we loosely use "decode" to mean requests
        # where attention is likely memory-bound and "prefill" to mean requests
        # where attention is likely compute-bound, TODO(lucas): figure out a
        # better naming here)
        decodes = []
        prefills = []

        for i, req_id in enumerate(input_batch.req_ids):
            num_tokens = scheduler_output.num_scheduled_tokens[req_id]
            if num_tokens <= self.decode_threshold:
                decodes.append(i)
            else:
                prefills.append(i)

        # We hope that this is fairly minimal since decodes
        # should be around for a number of iterations so hopefully they are
        # relatively stationary (and new request are generally appended to the
        # persistent batch so already should be at the back)
        # To achieve this we loop over the decodes in descending order and
        # the prefills in ascending order. We swap decodes from the "back"
        # i.e. past where the last decode should be in the reodorered with
        # prefills from the front of the batch.
        # `decodes` and `prefills` are already in ascending order just based on
        # the above loop
        num_decodes = len(decodes)
        num_prefills = len(prefills)
        first_prefill = 0
        modified_batch = False

        for i in range(1, min(num_decodes, num_prefills) + 1):
            # If the decode is at the "back" of the batch, i, we can swap it
            # with the prefill closest to the front of the batch
            if decodes[num_decodes - i] >= num_decodes:
                input_batch.swap_states(prefills[first_prefill],
                                        decodes[num_decodes - i])
                first_prefill += 1
                modified_batch = True
            else:
                break

        # Save for next `build` call
        # TODO(lucas): this is a bit of a hack, we should probably have a
        # better way of doing this
        return modified_batch

    def build(
        self,
        common_attn_metadata: AscendCommonAttentionMetadata,
        model: nn.Module,
    ) -> AscendMLAMetadata:
        # NOTE(review): this method continues past the end of the visible
        # chunk; only its head is reconstructed here. The remainder follows
        # in the unedited lines below.
        num_reqs = common_attn_metadata.num_reqs
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        query_start_loc = common_attn_metadata.query_start_loc
        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
common_attn_metadata.query_start_loc_cpu + # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \ + split_decodes_and_prefills(common_attn_metadata, decode_threshold=self.decode_threshold) + assert num_decodes + num_prefills == num_reqs + assert num_decode_tokens + num_prefill_tokens == num_actual_tokens + + # Note(simon): be careful about the CPU <> GPU memory movement in this + # function. We should avoid GPU -> CPU sync as much as possible because + # it blocks on all previous kernels. + device = self.device + + block_table = (common_attn_metadata.block_table_tensor[:num_reqs]) + slot_mapping = common_attn_metadata.slot_mapping_cpu[: + num_actual_tokens].to( + device, + non_blocking= + True) + input_positions = common_attn_metadata.positions[: + num_actual_tokens].long( + ) + + if self.cos_cache is None: + self.cos_cache = model.model.layers[ + 0].self_attn.rotary_emb.cos_cached + self.sin_cache = model.model.layers[ + 0].self_attn.rotary_emb.sin_cached + if self.cos_cache.dtype != self.model_config.dtype: # type: ignore + self.cos_cache = self.cos_cache.to( # type: ignore + self.model_config.dtype) # type: ignore + self.sin_cache = self.sin_cache.to( # type: ignore + self.model_config.dtype) # type: ignore + + query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] + query_lens = query_seq_lens_cpu[:num_reqs] + seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + num_computed_tokens_cpu = (seq_lens - query_lens) + + prefill_metadata = None + chunked_context_metadata = None + if num_prefills > 0: + reqs_start = num_decodes # prefill_start + tokens_start = num_decode_tokens + max_query_len = query_lens[reqs_start:].max().item() + max_seq_lens = seq_lens[reqs_start:].max().item() + prefill_query_start_loc = query_start_loc[ + reqs_start:] - query_start_loc[reqs_start] + + context_lens_cpu = num_computed_tokens_cpu[reqs_start:num_reqs] + 
max_context_len_cpu = context_lens_cpu.max().item() + num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item() + if self.chunked_prefill_enabled and max_context_len_cpu > 0: + max_context_chunk = (self.chunked_prefill_workspace_size // + num_prefills_with_context_cpu) + max_context_chunk = round_down(max_context_chunk, + self.block_size) + + assert max_context_chunk > 0 + num_chunks = cdiv(max_context_len_cpu, max_context_chunk) + chunk_starts = torch.arange(num_chunks, dtype=torch.int32) \ + .unsqueeze(1).expand(-1, num_prefills) * max_context_chunk + chunk_ends = torch.min(context_lens_cpu.unsqueeze(0), + chunk_starts + max_context_chunk) + chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0) + cu_seq_lens_cpu = torch.zeros(num_chunks, + num_prefills + 1, + dtype=torch.int32, + pin_memory=True) + torch.cumsum(chunk_seq_lens, + dim=1, + out=cu_seq_lens_cpu[:, 1:], + dtype=torch.int32) + chunked_context_metadata = \ + AscendMLAPrefillMetadata.ChunkedContextMetadata( + cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True), + starts=chunk_starts.to(device, non_blocking=True), + seq_tot=chunk_seq_lens.sum(dim=1).tolist(), + max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), + chunk_seq_lens=chunk_seq_lens, + workspace=self.chunked_prefill_workspace, + ) + prefill_input_positions = input_positions[tokens_start:] + cos = self.cos_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + prefill_metadata = AscendMLAPrefillMetadata( + attn_mask=common_attn_metadata.attn_mask, + query_lens=query_lens[reqs_start:], + seq_lens=seq_lens, + context_lens=seq_lens[reqs_start:], + input_positions=prefill_input_positions, + block_table=block_table[reqs_start:, ...], + max_query_len=max_query_len, + max_seq_lens=max_seq_lens, + query_start_loc=prefill_query_start_loc, + chunked_context=chunked_context_metadata, + sin=sin, + cos=cos, + ) + + 
decode_metadata = None + if num_decodes > 0: + actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist() + max_seq_lens = seq_lens[:num_decodes].max().item() + seq_lens = seq_lens[:num_decode_tokens] + input_positions = input_positions[:num_decode_tokens] + block_table = block_table[:num_decode_tokens, ...] + seq_lens_list = seq_lens.tolist() + + cos = self.cos_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + + decode_metadata = AscendMLADecodeMetadata( + input_positions=input_positions, + block_table=block_table, + seq_lens=seq_lens, + seq_lens_list=seq_lens_list, + max_seq_lens=max_seq_lens, + attn_mask=common_attn_metadata.spec_attn_mask, + actual_seq_lengths_q=actual_seq_lengths_q, + sin=sin, + cos=cos) + + return self.metadata_cls( # type: ignore + num_actual_tokens=num_actual_tokens, + query_lens=query_lens.tolist(), + slot_mapping=slot_mapping, + head_dim=self.model_config.get_head_size(), + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + num_prefills=num_prefills, + attn_mask=common_attn_metadata.attn_mask, + attn_state=common_attn_metadata.attn_state, + prefill=prefill_metadata, + decode=decode_metadata, + query_start_loc=query_start_loc, + block_tables=block_table, + seq_lens=seq_lens, + enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp, + ) + + +class DecodeMLAPreprocessResult(NamedTuple): + ql_nope: Optional[torch.Tensor] = None + q_pe: Optional[torch.Tensor] = None + k_nope: Optional[torch.Tensor] = None + k_pe: Optional[torch.Tensor] = None + + +class PrefillMLAPreprocessResult(NamedTuple): + q_nope: Optional[torch.Tensor] = None + q_pe: Optional[torch.Tensor] = None + k_nope: Optional[torch.Tensor] = None + k_pe: Optional[torch.Tensor] = None + value: Optional[torch.Tensor] = None + + +class AscendMLAImpl(MLAAttentionImpl): + """ + NOTE: Please read the comment at the top of the file before trying to + 
understand this class + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float], + attn_type: str, + kv_sharing_target_layer_name: Optional[str], + **kwargs, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + + # MLA Args + self.q_lora_rank = kwargs['q_lora_rank'] + self.kv_lora_rank = kwargs['kv_lora_rank'] + self.qk_nope_head_dim = kwargs['qk_nope_head_dim'] + self.qk_rope_head_dim = kwargs['qk_rope_head_dim'] + self.qk_head_dim = kwargs['qk_head_dim'] + self.v_head_dim = kwargs['v_head_dim'] + self.rotary_emb = kwargs['rotary_emb'] + self.q_proj = kwargs['q_proj'] + self.kv_b_proj = kwargs['kv_b_proj'] + self.o_proj = kwargs['o_proj'] + self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) + self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.q_a_proj = kwargs.get('q_a_proj', None) + self.q_a_layernorm = kwargs.get('q_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.tp_size = get_tensor_model_parallel_world_size() + + ascend_config = get_ascend_config() + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + self.enable_prefetch = ascend_config.enable_prefetch + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.chunked_prefill_for_mla = ascend_config.chunked_prefill_for_mla + + vllm_config = get_current_vllm_config() + self.ring_mla_mask_size = 512 + self.prefill_mask = None + + # Adapt torch air graph mode with spec decoding. 
+ speculative_config = vllm_config.speculative_config + if speculative_config is not None: + self.spec_token_num = speculative_config.num_speculative_tokens + assert self.spec_token_num > 0 + + def _v_up_proj(self, x): + # Convert from (B, N, L) to (N, B, L) + x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + return x + + # Return `ql_nope`, `q_pe` + def _q_proj_and_k_up_proj(self, x): + q_nope, q_pe = self.q_proj(x)[0]\ + .view(-1, self.num_heads, self.qk_head_dim)\ + .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + return ql_nope.transpose(0, 1), q_pe + + def process_weights_after_loading(self, act_dtype: torch.dtype): + + def get_layer_weight(layer): + WEIGHT_NAMES = ("weight", "qweight", "weight_packed") + for attr in WEIGHT_NAMES: + if hasattr(layer, attr): + return getattr(layer, attr) + raise AttributeError( + f"Layer '{layer}' has no recognized weight attribute:" + f" {WEIGHT_NAMES}.") + + def get_and_maybe_dequant_weights(layer: LinearBase): + if not isinstance(layer.quant_method, UnquantizedLinearMethod): + # NOTE: This should only be used offline, since it's O(N^3) + eye = torch.eye(layer.input_size_per_partition, + dtype=act_dtype, + device=get_layer_weight(layer).device) + dequant_weights = layer.quant_method.apply(layer, + eye, + bias=None) + del eye + # standardize to (output, input) + return dequant_weights.T + return layer.weight + + # we currently do not have quantized bmm's which are needed for + # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # the bmm's in 16-bit, the extra memory overhead of this is 
fairly low + kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T + assert kv_b_proj_weight.shape == ( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( + f"{kv_b_proj_weight.shape=}, " + f"{self.kv_lora_rank=}, " + f"{self.num_heads=}, " + f"{self.qk_nope_head_dim=}, " + f"{self.v_head_dim=}") + kv_b_proj_weight = kv_b_proj_weight.view( + self.kv_lora_rank, + self.num_heads, + self.qk_nope_head_dim + self.v_head_dim, + ) + + W_UK, W_UV = kv_b_proj_weight.split( + [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1).contiguous() + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0).contiguous() + + # Waiting for BMM NZ support + # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) + # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + + def _compute_prefill_context( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], + rope_dim: int, + attn_metadata: AscendMLAMetadata, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + ): + assert len(kv_c_and_k_pe_cache) > 1 + prefill_metadata = attn_metadata.prefill + if prefill_metadata is None or prefill_metadata.chunked_context is None: + return prefix_output, prefix_lse + + iters = len(prefill_metadata.chunked_context.seq_tot) + + seq_len1 = torch.tensor(prefill_metadata.query_lens, dtype=torch.int32) + cache_kv_c = kv_c_and_k_pe_cache[0] + cache_k_pe = kv_c_and_k_pe_cache[1] + num_heads = cache_k_pe.size(2) + latent_kv_dim = kv_c_and_k_pe_cache[0].size(-1) + for i in range(iters): + toks = prefill_metadata.chunked_context.seq_tot[i] + + seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i] + seq_len = torch.stack([seq_len1, seq_len2]) + kv_c_normed = torch.empty(toks, + num_heads, + latent_kv_dim, + dtype=q_nope.dtype, + device=q_nope.device) + k_pe = torch.empty(toks, + num_heads, + 
rope_dim, + dtype=q_nope.dtype, + device=q_nope.device) + + torch_npu.atb.npu_paged_cache_load( + cache_kv_c, + cache_k_pe, + prefill_metadata.block_table, + seq_len2.to(q_nope.device), + seq_starts=prefill_metadata.chunked_context.starts[i], + key=kv_c_normed, + value=k_pe, + ) + + kv_c_normed = kv_c_normed.squeeze() + kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = kv_nope\ + .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=v, + mask=self.prefill_mask, + seqlen=seq_len, + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=prefix_output, + prev_lse=prefix_lse, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="no_mask", + input_layout="type_bsnd", + calc_type="calc_type_default", + output=prefix_output, + softmax_lse=prefix_lse) + return prefix_output, prefix_lse + + def _forward_prefill( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + value: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], + attn_metadata: AscendMLAMetadata, + ) -> torch.Tensor: + assert attn_metadata.prefill is not None + assert len(kv_c_and_k_pe_cache) > 1 + num_tokens = q_nope.size(0) + attn_output = torch.empty(num_tokens, + self.num_heads, + self.v_head_dim, + dtype=q_nope.dtype, + device=q_nope.device) + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: + query = torch.cat((q_nope, q_pe), dim=-1) + key = torch.cat((k_nope, k_pe), dim=-1) + torch_npu._npu_flash_attention( + query=query, + key=key, + value=value, + mask=attn_metadata.attn_mask, + seq_len=attn_metadata.prefill.context_lens, + scale_value=self.scale, + num_heads=self.num_heads, + num_kv_heads=self.num_heads, + out=attn_output) + elif self.chunked_prefill_for_mla: + attn_lse = 
torch.empty(self.num_heads, + num_tokens, + dtype=torch.float32, + device=q_nope.device) + if self.prefill_mask is None: + self.prefill_mask = torch.triu( + torch.ones(self.ring_mla_mask_size, + self.ring_mla_mask_size, + device=q_nope.device, + dtype=q_nope.dtype), 1) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=value, + mask=self.prefill_mask, + seqlen=torch.tensor(attn_metadata.prefill.query_lens, + dtype=torch.int32), + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=None, + prev_lse=None, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="mask_type_triu", + input_layout="type_bsnd", + calc_type="calc_type_first_ring", + output=attn_output, + softmax_lse=attn_lse) + attn_output, attn_lse = self._compute_prefill_context( \ + q_nope, q_pe, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse) + else: + query = torch.cat((q_nope, q_pe), dim=-1) + attn_output_torch = torch.empty(num_tokens, + self.num_heads * self.v_head_dim, + dtype=query.dtype, + device=query.device) + # current requests is chunked in prefill, disable flash attention with chunked prefill + vanilla_chunked_prefill_mla( + output=attn_output_torch, + query=query, + kv_cache=kv_c_and_k_pe_cache, + block_tables=attn_metadata.prefill.block_table, + query_lens=attn_metadata.prefill.query_lens, + context_lens=attn_metadata.prefill.context_lens, + kv_b_proj=self.kv_b_proj, + max_query_len=attn_metadata.prefill.max_query_len, + max_context_len=attn_metadata.prefill.max_seq_lens, + nope_dim=self.qk_nope_head_dim, + rope_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + scale=self.scale, + alibi_slopes=None, + causal=True) + + attn_output = attn_output.reshape( + [num_tokens, self.num_heads * self.v_head_dim]) + if attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit + ] and not 
self.chunked_prefill_for_mla: + attn_output = attn_output_torch + return attn_output + + def exec_kv_decode( + self, + kv_no_split: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + B = kv_no_split.shape[0] + N = self.num_kv_heads + S = 1 + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv_no_split = kv_no_split.view( + B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv_no_split, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) + return k_pe, k_nope + + def exec_kv_prefill( + self, + kv_no_split: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + B = kv_no_split.shape[0] + N = self.num_kv_heads + S = 1 + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv_no_split = kv_no_split.view( + B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( + kv_no_split, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + is_output_kv=True, + ) + return k_pe, k_nope + + def rope_single( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + B, N, D = x.shape + S = 1 + x = x.view(B, N, S, D) + x = torch_npu.npu_interleave_rope(x, cos, sin) + return x.view(B, N, D) + + def _forward_decode( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + block_size: int, + attn_metadata: AscendMLAMetadata, + ) -> torch.Tensor: + decode_meta = attn_metadata.decode + assert decode_meta is not None + num_tokens = q_nope.size(0) + # 
shape of knope/k_pe for npu graph mode should be: + # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] + actual_seq_lengths = None + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" + + if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: + assert num_tokens % self.spec_token_num == 0 + input_layout = "TND" + # [bs * q_seq_len, num_heads_per_rank, dim] + q_nope = q_nope.view(num_tokens, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, -1) + sparse_mode = 3 + spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore + actual_seq_lengths = decode_meta.actual_seq_lengths_q + else: + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + sparse_mode = 0 + spec_attn_mask = None + + attn_output, _ = torch_npu.npu_fused_infer_attention_score( + q_nope, + k_nope, + k_nope, + query_rope=q_pe, + key_rope=k_pe, + num_heads=self.num_heads, + num_key_value_heads=self.num_kv_heads, + input_layout=input_layout, + atten_mask=spec_attn_mask, + sparse_mode=sparse_mode, + scale=self.scale, + antiquant_mode=0, + antiquant_scale=None, + block_table=decode_meta.block_table, + block_size=block_size, + actual_seq_lengths_kv=decode_meta.seq_lens_list, + actual_seq_lengths=actual_seq_lengths) + + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is None: + return self._v_up_proj(attn_output) + else: + 
current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + return self._v_up_proj(attn_output) + + def _mla_preprocess(self, hidden_states, kv_cache, attn_metadata, + need_gather_q_kv): + # MLA Preprocess: + # 1. Perform q_a_proj and q_a_layernorm to obtain q_c + # 2. Perform kv_a_proj_with_mqa to obtain kv_no_split + # 3. If need_gather_q_kv, perform all_gather. + # 4. Preprocess decode tokens, write kv cache and get: + # decode_ql_nope, decode_q_pe, decode_k_pe, decode_k_nope + # 5. Preprocess prefill tokens, write kv cache and get: + # prefill_q_nope, prefill_q_pe, prefill_k_nope, prefill_k_pe, prefill_value + has_decode = attn_metadata.num_decodes > 0 + has_prefill = attn_metadata.num_prefills > 0 + num_decode_tokens = attn_metadata.num_decode_tokens + num_actual_tokens = attn_metadata.num_actual_tokens + if self.q_a_proj is not None: + npu_prefetch(self.q_a_proj.weight, + hidden_states, + enabled=self.enable_prefetch) + ckq = self.q_a_proj(hidden_states)[0] + q_c = self.q_a_layernorm(ckq) + else: + q_c = hidden_states + + kv_no_split = self.kv_a_proj_with_mqa(hidden_states)[0] + # Process for shared_expert_dp + if need_gather_q_kv: + q_c = get_tp_group().all_gather(q_c, 0) + kv_no_split = get_tp_group().all_gather(kv_no_split, 0) + decode_preprocess_res = None + prefill_preprocess_res = None + # Preprocess for decode tokens + if has_decode: + decode_q_c = q_c[:num_decode_tokens] + cos = attn_metadata.decode.cos + sin = attn_metadata.decode.sin + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_q_c) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + decode_slots = attn_metadata.slot_mapping[:num_decode_tokens] + decode_kv_no_split = kv_no_split[:num_decode_tokens] + decode_k_pe, decode_k_nope = self.exec_kv_decode( + decode_kv_no_split, cos, sin, kv_cache, decode_slots) + decode_preprocess_res = DecodeMLAPreprocessResult( + decode_ql_nope, 
decode_q_pe, decode_k_nope, decode_k_pe) + # Preprocess for prefill tokens + if has_prefill: + prefill_kv_no_split = kv_no_split[ + num_decode_tokens:num_actual_tokens] + prefill_q_c = q_c[num_decode_tokens:num_actual_tokens] + prefill_q = self.q_proj(prefill_q_c)[0]\ + .view(-1, self.num_heads, self.qk_head_dim) + prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:] + prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] + cos = attn_metadata.prefill.cos + sin = attn_metadata.prefill.sin + prefill_slots = attn_metadata.slot_mapping[ + num_decode_tokens:num_actual_tokens] + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_c_normed = self.exec_kv_prefill( + prefill_kv_no_split, cos, sin, kv_cache, prefill_slots) + prefill_k_pe = prefill_k_pe.view(prefill_q_c.shape[0], + self.num_kv_heads, -1) + prefill_k_nope, prefill_value = self.kv_b_proj( + prefill_k_c_normed)[0].view( + -1, self.num_heads, + self.qk_nope_head_dim + self.v_head_dim).split( + [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + prefill_k_pe = prefill_k_pe.expand( + (*prefill_k_nope.shape[:-1], -1)) + prefill_preprocess_res = PrefillMLAPreprocessResult( + prefill_q_nope, prefill_q_pe, prefill_k_nope, prefill_k_pe, + prefill_value) + return decode_preprocess_res, prefill_preprocess_res + + def forward( + self, + hidden_states: torch.Tensor, # query in unified attn + kv_cache: Tuple[torch.Tensor], + attn_metadata: M, + need_gather_q_kv: bool = False, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + assert output is not None, "Output tensor must be provided." + if attn_metadata is None: + # Profiling run. 
+ return output + num_actual_tokens = attn_metadata.num_actual_tokens + assert attn_metadata.num_decodes is not None and \ + attn_metadata.num_prefills is not None and \ + attn_metadata.num_decode_tokens is not None + num_decode_tokens = attn_metadata.num_decode_tokens + # Inputs and outputs may be padded for CUDA graphs + output_padded = output + output = output[:num_actual_tokens, ...] + o_proj_input_shape = (num_actual_tokens, + self.num_heads * self.v_head_dim) + o_proj_input = torch.empty(o_proj_input_shape, + dtype=hidden_states.dtype, + device=hidden_states.device) + + # MLA Preprocess + decode_preprocess_res, prefill_preprocess_res = self._mla_preprocess( + hidden_states, kv_cache, attn_metadata, need_gather_q_kv) + + if decode_preprocess_res is not None: + # MLA Preprocess for decoding + output_decode = self._forward_decode(decode_preprocess_res.ql_nope, + decode_preprocess_res.q_pe, + decode_preprocess_res.k_nope, + decode_preprocess_res.k_pe, + kv_cache[0].shape[1], + attn_metadata) + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is not None: + with torch.npu.stream(current_ms_metadata.comm_stream): + o_proj_input[:num_decode_tokens] = output_decode + current_ms_metadata.after_comm_event.record() + else: + o_proj_input[:num_decode_tokens] = output_decode + + if prefill_preprocess_res is not None: + # FIX: aicore move should be also placed on the comm stream in dbo, + # otherwise it may affect the accuracy + # TODO: use an elegant way to overlap + output_prefill = self._forward_prefill( + prefill_preprocess_res.q_nope, prefill_preprocess_res.q_pe, + prefill_preprocess_res.k_nope, prefill_preprocess_res.k_pe, + prefill_preprocess_res.value, kv_cache, attn_metadata) + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is not None: + with torch.npu.stream(current_ms_metadata.comm_stream): + o_proj_input[num_decode_tokens:] = output_prefill + current_ms_metadata.after_comm_event.record() + else: + 
o_proj_input[num_decode_tokens:] = output_prefill + # O proj + current_ms_metadata = get_multistream_comm_context() + MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 + if current_ms_metadata is None: + npu_prefetch(self.o_proj.weight, + o_proj_input, + max_size=MAX_O_PROJ_PREFETCH_SIZE, + enabled=self.enable_prefetch) + + output[...] = self.o_proj( + o_proj_input, + is_prefill=prefill_preprocess_res is not None, + is_force_scatter=self.enable_shared_expert_dp)[0] + else: + with torch.npu.stream(current_ms_metadata.comm_stream): + npu_prefetch(self.o_proj.weight, + o_proj_input, + max_size=MAX_O_PROJ_PREFETCH_SIZE, + enabled=self.enable_prefetch) + output[...] = self.o_proj( + o_proj_input, + is_prefill=prefill_preprocess_res is not None, + is_force_scatter=self.enable_shared_expert_dp)[0] + current_ms_metadata.after_comm_event.record() + del o_proj_input + return output_padded diff --git a/vllm_ascend/attention/utils.py b/vllm_ascend/attention/utils.py new file mode 100644 index 0000000..2ef537f --- /dev/null +++ b/vllm_ascend/attention/utils.py @@ -0,0 +1,95 @@ +from dataclasses import dataclass +from typing import Any + +import torch + + +@dataclass +class AscendCommonAttentionMetadata: + """ + Per-batch attention metadata, shared across layers and backends. + AttentionMetadataBuilder instances use it to construct per-layer metadata. + + For many of the tensors we keep both GPU and CPU versions. 
+ """ + + query_start_loc: torch.Tensor + query_start_loc_cpu: torch.Tensor + """(batch_size + 1,), the start location of each request in query Tensor""" + + seq_lens_cpu: torch.Tensor + """(batch_size,), the length of each request including both computed tokens + and newly scheduled tokens""" + + num_reqs: int + """Number of requests""" + num_actual_tokens: int + """Total number of tokens in batch""" + + max_query_len: int + """Max token number of request in batch""" + + decode_token_per_req: int + """decode token number per request""" + + block_table_tensor: torch.Tensor + + slot_mapping_cpu: torch.Tensor + + actual_seq_lengths_q: list[int] + + positions: torch.Tensor = None + + attn_mask: torch.Tensor = None + + spec_attn_mask: torch.Tensor = None + + attn_state: Any = None + + enable_dbo_across_dp: bool = False + + is_only_prefill: bool = False + + graph_pad_size: int = -1 + + +def split_decodes_and_prefills( + common_attn_metadata: AscendCommonAttentionMetadata, + decode_threshold: int = 1, +) -> tuple[int, int, int, int]: + """ + Assuming a reordered batch, finds the boundary between prefill and decode + requests. + + Args: + common_attn_metadata: AscendCommonAttentionMetadata object containing the + batch metadata. + decode_threshold: The maximum query length to be considered a decode. + + Returns: + num_decodes: The number of decode requests. + num_prefills: The number of prefill requests. + num_decode_tokens: The number of tokens in the decode requests. + num_prefill_tokens: The number of tokens in the prefill requests. 
+ """ + max_query_len = common_attn_metadata.max_query_len + num_reqs = common_attn_metadata.num_reqs + num_tokens = common_attn_metadata.num_actual_tokens + query_start_loc = common_attn_metadata.query_start_loc_cpu + + if max_query_len <= decode_threshold: + return num_reqs, 0, num_tokens, 0 + + query_lens = query_start_loc[1:] - query_start_loc[:-1] + is_prefill = query_lens > decode_threshold + if not torch.any(is_prefill): + return num_reqs, 0, num_tokens, 0 + + first_prefill = is_prefill.int().argmax(dim=-1).item() + assert torch.all(query_lens[first_prefill:] >= decode_threshold) + assert torch.all(query_lens[:first_prefill] <= decode_threshold) + num_decodes = first_prefill + num_prefills = num_reqs - num_decodes + num_decode_tokens = query_start_loc[first_prefill].item() + num_prefill_tokens = num_tokens - num_decode_tokens + return (num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens) diff --git a/vllm_ascend/compilation/__init__.py b/vllm_ascend/compilation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/compilation/acl_graph.py b/vllm_ascend/compilation/acl_graph.py new file mode 100644 index 0000000..f8dfc24 --- /dev/null +++ b/vllm_ascend/compilation/acl_graph.py @@ -0,0 +1,185 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import dataclasses +from contextlib import ExitStack +from typing import Any, Callable, Optional +from unittest.mock import patch + +import torch +import vllm.envs as envs +from vllm.compilation.counter import compilation_counter +from vllm.compilation.cuda_graph import CUDAGraphOptions +from vllm.compilation.monitor import validate_cudagraph_capturing_enabled +from vllm.config import CUDAGraphMode, VllmConfig +from vllm.forward_context import BatchDescriptor, get_forward_context +from vllm.logger import logger +from vllm.platforms import current_platform +from vllm.utils import weak_ref_tensors + + 
+@dataclasses.dataclass
+class ACLGraphEntry:
+    batch_descriptor: BatchDescriptor
+    aclgraph: Optional[torch.npu.NPUGraph] = None
+    output: Optional[Any] = None
+
+    # for aclgraph debugging, track the input addresses
+    # during capture, and check if they are the same during replay
+    input_addresses: Optional[list[int]] = None
+
+
+class ACLGraphWrapper:
+    """Wraps a runnable to add acl graph capturing and replaying ability. And
+    provide attribute access to the underlying `runnable` via `__getattr__`.
+
+    The workflow of this wrapper in the aclgraph dispatching is as follows:
+    1. At initialization, a runtime mode is assigned to the wrapper (FULL or
+    PIECEWISE).
+    2. At runtime, the wrapper receives a runtime_mode and a
+    batch_descriptor(key) from the forward context and blindly trusts them
+    for aclgraph dispatching.
+    3. If runtime_mode is NONE or runtime_mode does not match the mode of the
+    wrapper, just call the runnable directly.
+    4. Otherwise, i.e., the runtime_mode matches the mode of the wrapper,
+    the wrapper will perform aclgraph capture(if key does not exist, create
+    a new entry and cache it) or replay (if key exists in the cache).
+
+    Note: ACLGraphWrapper does not store persistent buffers or copy any
+    runtime inputs into those buffers for replay. We assume implementing them
+    is done outside of the wrapper. That is because we do not make any
+    assumption on the dynamic shape (batch size) of the runtime inputs, as a
+    trade-off for staying orthogonal to compilation logic. Nevertheless,
+    tracing and checking the input addresses to be consistent during replay is
+    guaranteed when VLLM_LOGGING_LEVEL == "DEBUG".
+    """
+
+    def __init__(self,
+                 runnable: Callable,
+                 vllm_config: VllmConfig,
+                 runtime_mode: CUDAGraphMode,
+                 graph_pool: Any = None,
+                 cudagraph_options: Optional[CUDAGraphOptions] = None):
+        self.runnable = runnable
+        self.vllm_config = vllm_config
+        self.graph_pool = graph_pool
+        self.runtime_mode = runtime_mode
+        self.compilation_config = vllm_config.compilation_config
+
+        self.first_run_finished = False
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
+        # assert runtime_mode is not NONE(no aclgraph), otherwise, we don't
+        # need to initialize an ACLGraphWrapper.
+        assert self.runtime_mode != CUDAGraphMode.NONE
+        if self.graph_pool is None:
+            self.graph_pool = current_platform.get_global_graph_pool()
+
+        if cudagraph_options is None:
+            cudagraph_options = CUDAGraphOptions()
+        self.aclgraph_options = cudagraph_options
+        # the entries for different batch descriptors that we need to capture
+        # aclgraphs for.
+        self.concrete_aclgraph_entries: dict[BatchDescriptor, ACLGraphEntry]\
+            = {}
+
+    def __getattr__(self, key: str):
+        # allow accessing the attributes of the runnable.
+        if hasattr(self.runnable, key):
+            return getattr(self.runnable, key)
+        raise AttributeError(f"Attribute {key} not exists in the runnable of "
+                             f"aclgraph wrapper: {self.runnable}")
+
+    def unwrap(self) -> Callable:
+        # in case we need to access the original runnable.
+        return self.runnable
+
+    def __call__(self, *args, **kwargs):
+        forward_context = get_forward_context()
+        batch_descriptor = forward_context.batch_descriptor
+        aclgraph_runtime_mode = forward_context.cudagraph_runtime_mode
+
+        if aclgraph_runtime_mode == CUDAGraphMode.NONE or \
+            aclgraph_runtime_mode != self.runtime_mode:
+            # CUDAGraphMode.NONE could mean the profile run, a warmup run, or
+            # running without aclgraphs.
+            # We do not trigger capture/replay if the runtime mode does not
+            # match. This enables properly dispatching to the correct
+            # CUDAGraphWrapper when nesting multiple instances with different
+            # runtime modes.
+            return self.runnable(*args, **kwargs)
+
+        if batch_descriptor not in self.concrete_aclgraph_entries:
+            # create a new entry for this batch descriptor
+            self.concrete_aclgraph_entries[batch_descriptor] = \
+                ACLGraphEntry(batch_descriptor=batch_descriptor)
+
+        entry = self.concrete_aclgraph_entries[batch_descriptor]
+
+        if entry.aclgraph is None:
+            if self.aclgraph_options.debug_log_enable:
+                # Since we capture aclgraph for many different shapes and
+                # capturing is fast, we don't need to log it for every
+                # shape. E.g. we only log it for the first subgraph in
+                # piecewise mode.
+                logger.debug("Capturing a aclgraph on (%s,%s)",
+                             self.runtime_mode.name, entry.batch_descriptor)
+            # validate that aclgraph capturing is legal at this point.
+            validate_cudagraph_capturing_enabled()
+
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
+            aclgraph = torch.npu.NPUGraph()
+
+            with ExitStack() as stack:
+                if self.aclgraph_options.gc_disable:
+                    # during every model forward for piecewise aclgraph
+                    # mode, we will capture many pieces of aclgraphs
+                    # (roughly one per layer). running gc again and again
+                    # across layers will make the aclgraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.npu.empty_cache", lambda: None))
+
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.npu.graph(aclgraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's aclgraph pool
+                    output = self.runnable(*args, **kwargs)
+                    if self.aclgraph_options.weak_ref_output:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph in piecewise aclgraph mode, because
+                        # the output of the last graph will not be used by
+                        # any other acl graph.
+                        output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.aclgraph = aclgraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during acl graph capture
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                f"Input addresses for aclgraphs are different "
+                f"during replay. Expected {entry.input_addresses}, "
+                f"got {new_input_addresses}")
+
+        logger.info_once("Replaying aclgraph")
+        entry.aclgraph.replay()
+        return entry.output
diff --git a/vllm_ascend/core/__init__.py b/vllm_ascend/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm_ascend/core/schedule_config.py b/vllm_ascend/core/schedule_config.py
new file mode 100644
index 0000000..4ee02e7
--- /dev/null
+++ b/vllm_ascend/core/schedule_config.py
@@ -0,0 +1,84 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from dataclasses import dataclass, fields
+from typing import Type, Union
+
+from vllm.config import SchedulerConfig
+
+
+@dataclass
+class AscendSchedulerConfig(SchedulerConfig):
+    # Ascend scheduler defaults: prefill-first scheduling with chunked
+    # prefill, priority policies and multi-step disabled.
+    enable_chunked_prefill: bool = False
+    policy: str = "fcfs"
+    num_scheduler_steps: int = 1
+    scheduler_cls: Union[str, Type[object]] = (
+        "vllm_ascend.core.scheduler.AscendScheduler")
+
+    @classmethod
+    def initialize_from_config(
+        cls,
+        vllm_scheduler_config: SchedulerConfig,
+        ascend_scheduler_config,
+    ):
+        # Copy every init-able field from vllm's SchedulerConfig.
+        scheduler_config = {
+            field.name: getattr(vllm_scheduler_config, field.name)
+            for field in fields(vllm_scheduler_config) if field.init
+        }
+        # Apply Ascend scheduler defaults on top of the copied values.
+        scheduler_config["enable_chunked_prefill"] = False
+        scheduler_config["policy"] = "fcfs"
+        scheduler_config["num_scheduler_steps"] = 1
+        scheduler_config["scheduler_cls"] = (
+            "vllm_ascend.core.scheduler.AscendScheduler")
+        # Finally, let any attribute present on ascend_scheduler_config win.
+        for k in scheduler_config:
+            if hasattr(ascend_scheduler_config, k):
+                scheduler_config[k] = getattr(ascend_scheduler_config, k)
+        return cls(**scheduler_config)
+
+    def __post_init__(self) -> None:  # NOTE(review): intentionally does not chain to SchedulerConfig.__post_init__ — confirm parent derivations are not needed
+        self.max_num_encoder_input_tokens = self.max_num_batched_tokens
+        self.encoder_cache_size = self.max_num_batched_tokens
+        self.chunked_prefill_enabled = self.enable_chunked_prefill
+        if (self.max_num_batched_tokens < self.max_model_len
+                and not self.chunked_prefill_enabled):
+            raise ValueError(
+                "Ascend scheduler is enabled without chunked prefill feature. "
+                f"Argument max_num_batched_tokens ({self.max_num_batched_tokens}) is "
+                f"smaller than max_model_len ({self.max_model_len}). "
+                "This effectively limits the maximum sequence length to "
+                "max_num_batched_tokens and makes vLLM reject longer "
+                "sequences. Please increase max_num_batched_tokens or "
+                "decrease max_model_len.")
+        if self.policy != "fcfs":
+            raise NotImplementedError(
+                f"currently AscendScheduler only supports fcfs policy, got {self.policy}"
+            )
+        if self.is_multimodal_model:
+            raise NotImplementedError(
+                "currently AscendScheduler only supports LLM models.")
+        if self.num_scheduler_steps > 1:
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support multi-step.")
+        if self.send_delta_data:
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support send_delta_data.")
+        if self.delay_factor > 0:
+            raise NotImplementedError(
+                "currently AscendScheduler doesn't support scheduler_delay_factor."
+            )
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
new file mode 100644
index 0000000..f8c7f49
--- /dev/null
+++ b/vllm_ascend/core/scheduler.py
@@ -0,0 +1,538 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# +import time +from collections import deque +from typing import Iterable, Union + +from vllm.config import VllmConfig +from vllm.distributed.kv_events import KVEventBatch +from vllm.logger import logger +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.utils import cdiv +from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.engine import EngineCoreEventType, EngineCoreOutputs +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus +from vllm.v1.structured_output import StructuredOutputManager + +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + from vllm.v1.core.kv_cache_manager import KVCacheBlocks +else: + KVCacheBlocks = None + + +class AscendScheduler(Scheduler): + """This Scheduler extends vllm's original v1 scheduler + with prefill-first scheduling strategy.""" + + def __init__( + self, + vllm_config: VllmConfig, + kv_cache_config: KVCacheConfig, + structured_output_manager: StructuredOutputManager, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + include_finished_set: bool = False, + log_stats: bool = False, + ) -> None: + super().__init__(vllm_config, kv_cache_config, + structured_output_manager, mm_registry, + include_finished_set, log_stats) + self.scheduled_req_ids: set[str] = set() + self.running: list[Request] = [] + + def schedule(self) -> SchedulerOutput: + if self.scheduler_config.chunked_prefill_enabled: + return super().schedule() + scheduled_new_reqs: list[Request] = [] + scheduled_resumed_reqs: list[Request] = [] + scheduled_running_reqs: list[Request] = [] + preempted_reqs: list[Request] = [] + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + req_to_new_block_ids: dict[str, list[list[int]]] = {} + else: + req_to_new_blocks: dict[str, KVCacheBlocks] = 
{} + num_scheduled_tokens: dict[str, int] = {} + token_budget = self.max_num_scheduled_tokens + # Spec decode-related. + scheduled_spec_decode_tokens: dict[str, list[int]] = {} + + # For logging. + scheduled_timestamp = time.monotonic() + + # Record scheduled LoRA requests. + scheduled_loras: set[int] = set() + + # Use a temporary deque to collect requests that need to be skipped + # and put back at the head of the waiting queue later + skipped_waiting_requests: deque[Request] = deque() + + # Schedule prefill requests first. + while self.waiting and token_budget > 0: + if len(self.running) == self.max_num_running_reqs: + break + + request = self.waiting[0] + + def skip_cur_request(): + self.waiting.popleft() + skipped_waiting_requests.appendleft(request) + + # P/D: skip request if still waiting for remote kvs. + if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS: + is_ready = self._update_waiting_for_remote_kv(request) + if is_ready: + request.status = RequestStatus.WAITING + else: + skip_cur_request() + continue + + # Check that adding the request still respects the max_loras + # constraint. + if (self.lora_config and request.lora_request and + (len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id not in scheduled_loras)): + # Scheduling would exceed max_loras, skip. + skip_cur_request() + continue + + num_external_computed_tokens = 0 + load_kv_async = False + + # Get already-cached tokens. + if request.num_computed_tokens == 0: + new_computed_blocks, num_new_local_computed_tokens = \ + self.kv_cache_manager.get_computed_blocks( + request) + + # Get externally-cached tokens if using a KVConnector. + if self.connector is not None: + num_external_computed_tokens, load_kv_async = ( + self.connector.get_num_new_matched_tokens( + request, num_new_local_computed_tokens)) + + # Total computed tokens (local + external). 
+ num_computed_tokens = (num_new_local_computed_tokens + + num_external_computed_tokens) + else: + # P/D: skip checking prefix cache if loaded from remote kvs. + new_computed_blocks = ( + self.kv_cache_manager.create_empty_block_list()) + num_new_local_computed_tokens = 0 + num_computed_tokens = request.num_computed_tokens + + # P/D: loading remote KV, do not allocate for new work. + if load_kv_async: + assert num_external_computed_tokens > 0 + num_new_tokens = 0 + blocks = None + # Number of tokens to be scheduled. + else: + prompt_limit = self._get_prompt_limit(request) + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed + # requests, which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add( # type: ignore + request.request_id) # type: ignore + self.waiting.popleft() + continue + + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + assert num_new_tokens > 0 + blocks = new_computed_blocks.blocks[0] + + watermark = getattr(self.scheduler_config, "watermark", 0.01) + if not self._check_watermark_for_prefill(request, num_new_tokens, + blocks, watermark): + # Scheduling would exceed watermark, skip. 
+ skip_cur_request() + continue + + new_blocks = self.kv_cache_manager.allocate_slots( + request, + num_new_tokens + num_external_computed_tokens, + num_new_local_computed_tokens, + new_computed_blocks=new_computed_blocks, + num_lookahead_tokens=self.num_lookahead_tokens, + delay_cache_blocks=load_kv_async) + if new_blocks is None: + # The request cannot be scheduled. + break + + # KVConnector: update internal state after allocation. + # This information is used to determine if a load is + # needed for this request. + if self.connector is not None: + self.connector.update_state_after_alloc( + request, + new_computed_blocks + new_blocks, + num_external_computed_tokens, + ) + + self.waiting.popleft() + if load_kv_async: + # If loading async, allocate memory and put request + # into the WAITING_FOR_REMOTE_KV state. + skipped_waiting_requests.appendleft(request) + request.status = RequestStatus.WAITING_FOR_REMOTE_KVS + continue + + self.running.append(request) + if self.log_stats: + request.record_event(EngineCoreEventType.SCHEDULED, + scheduled_timestamp) + self.scheduled_req_ids.add(request.request_id) + # Check request status. + if request.status == RequestStatus.WAITING: + scheduled_new_reqs.append(request) + elif request.status == RequestStatus.PREEMPTED: + scheduled_resumed_reqs.append(request) + else: + raise RuntimeError(f"Invalid request status: {request.status}") + + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + req_to_new_block_ids[request.request_id] = ( + self.kv_cache_manager.get_block_ids(request.request_id)) + else: + req_to_new_blocks[ + request.request_id] = self.kv_cache_manager.get_blocks( + request.request_id) + # Update request info. 
+ num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + # Count the number of prefix cached tokens. + if request.num_cached_tokens < 0: + request.num_cached_tokens = num_computed_tokens + + # Put back any skipped requests at the head of the waiting queue + if skipped_waiting_requests: + self.waiting.extendleft(skipped_waiting_requests) + + # If no prefill requests are scheduled, + # Schedule decode requests next. + if len(self.scheduled_req_ids) == 0: + req_index = 0 + while req_index < len(self.running) and token_budget > 0: + request = self.running[req_index] + if request.request_id in self.scheduled_req_ids: + # This request has already been scheduled. + req_index += 1 + continue + + num_new_tokens = (request.num_tokens_with_spec - + request.num_computed_tokens) + assert (request.num_tokens - request.num_computed_tokens) == 1 + num_new_tokens = min(num_new_tokens, token_budget) + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request and ( + len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id + not in scheduled_loras): + # Scheduling would exceed max_loras, skip. + num_new_tokens = 0 + + if num_new_tokens == 0: + # The request cannot be scheduled because one of the following + # reason: + # 1. No new tokens to schedule. This may happen when PP>1 and + # we have already scheduled all prompt tokens but they are + # not finished yet. + # 2. Adding the request exceeds the max_loras constraint. 
+ # NOTE(woosuk): Here, by doing `continue` instead of `break`, + # we do not strictly follow the FCFS scheduling policy and + # allow the lower-priority requests to be scheduled. + req_index += 1 + continue + + while True: + new_blocks = self.kv_cache_manager.allocate_slots( + request, + num_new_tokens, + num_lookahead_tokens=self.num_lookahead_tokens) + if new_blocks is None: + # The request cannot be scheduled. + # Preempt the lowest-priority request. + preempted_req = self.running.pop() + self.kv_cache_manager.free(preempted_req) + preempted_req.status = RequestStatus.PREEMPTED + preempted_req.num_computed_tokens = 0 + if self.log_stats: + preempted_req.record_event( + EngineCoreEventType.PREEMPTED, + scheduled_timestamp) + self.waiting.appendleft(preempted_req) + preempted_reqs.append(preempted_req) + if preempted_req == request: + # No more request to preempt. + can_schedule = False + break + else: + # The request can be scheduled. + can_schedule = True + break + if not can_schedule: + break + assert new_blocks is not None + + # Schedule the request. + scheduled_running_reqs.append(request) + self.scheduled_req_ids.add(request.request_id) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + req_to_new_block_ids[request.request_id] = ( + new_blocks.get_block_ids()) + else: + req_to_new_blocks[request.request_id] = new_blocks + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + + # Speculative decode related. + if request.spec_token_ids: + num_scheduled_spec_tokens = (num_new_tokens + + request.num_computed_tokens - + request.num_tokens) + if num_scheduled_spec_tokens > 0: + # Trim spec_token_ids list to num_scheduled_spec_tokens. + del request.spec_token_ids[num_scheduled_spec_tokens:] + scheduled_spec_decode_tokens[request.request_id] = ( + request.spec_token_ids) + + # Record scheduled LoRA requests. 
+ if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + + # Check if the scheduling constraints are satisfied. + total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) + assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens + assert token_budget >= 0 + assert len(self.running) <= self.max_num_running_reqs + assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( + scheduled_running_reqs) <= len(self.running) + + # Get the longest common prefix among all requests in the running queue. + # This can be potentially used for cascade attention. + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) + if self.running: + any_request = self.running[0] + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request, len(self.running))) + + # Construct the scheduler output. + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_block_ids) + else: + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids()) + for req in scheduled_new_reqs + ] + + cached_reqs_data = self._make_cached_request_data( + scheduled_running_reqs, scheduled_resumed_reqs, + num_scheduled_tokens, scheduled_spec_decode_tokens, + req_to_new_blocks) + scheduled_cached_reqs = cached_reqs_data + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + scheduler_output = SchedulerOutput( + scheduled_new_reqs=new_reqs_data, + scheduled_cached_reqs=scheduled_cached_reqs, + num_scheduled_tokens=num_scheduled_tokens, + total_num_scheduled_tokens=total_num_scheduled_tokens, + 
scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=num_common_prefix_blocks, + # finished_req_ids is an existing state in the scheduler, + # instead of being newly scheduled in this step. + # It contains the request IDs that are finished in between + # the previous and the current steps. + finished_req_ids=self.finished_req_ids, # type: ignore + free_encoder_input_ids=self.encoder_cache_manager. + get_freed_ids(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + else: + scheduler_output = SchedulerOutput( + scheduled_new_reqs=new_reqs_data, + scheduled_cached_reqs=scheduled_cached_reqs, + num_scheduled_tokens=num_scheduled_tokens, + total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=num_common_prefix_blocks, + # finished_req_ids is an existing state in the scheduler, + # instead of being newly scheduled in this step. + # It contains the request IDs that are finished in between + # the previous and the current steps. + finished_req_ids=self.finished_req_ids, # type: ignore + free_encoder_mm_hashes=self.encoder_cache_manager. + get_freed_mm_hashes(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + + # NOTE(Kuntai): this function is designed for multiple purposes: + # 1. Plan the KV cache store + # 2. Wrap up all the KV cache load / save ops into an opaque object + # 3. Clear the internal states of the connector + if self.connector is not None: + meta = self.connector.build_connector_meta(scheduler_output) + scheduler_output.kv_connector_metadata = meta + + events = self.kv_cache_manager.take_events() + if events: + batch = KVEventBatch(ts=time.time(), events=events) + self.kv_event_publisher.publish(batch) + + # Advance the number of computed tokens for the request AFTER + # the request is scheduled. + # 1. 
The scheduler_output of the current step has to include the + # original number of scheduled tokens to determine input IDs. + # 2. Advance the number of computed tokens here allowing us to + # schedule the prefill request again immediately in the next + # scheduling step. + # 3. If some tokens (e.g. spec tokens) are rejected later, the number of + # computed tokens will be adjusted in update_from_output. + for req_id, num_scheduled_token in num_scheduled_tokens.items(): + self.requests[req_id].num_computed_tokens += num_scheduled_token + + self.finished_req_ids = set() # type: ignore + return scheduler_output + + def _check_watermark_for_prefill(self, + request, + num_new_tokens, + computed_blocks, + watermark=0.01): + computed_blocks = computed_blocks or [] + watermark_blocks = self.kv_cache_config.num_blocks * watermark + num_computed_tokens = (request.num_computed_tokens + + len(computed_blocks) * self.block_size) + num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, + self.block_size) + req_blocks = self.kv_cache_manager.coordinator.get_blocks( + request.request_id) + num_new_blocks = (num_required_blocks - len(req_blocks[0]) - + len(computed_blocks)) + num_evictable_computed_blocks = sum(1 for blk in computed_blocks + if blk.ref_cnt == 0) + # If number of free blocks is less than water mark after allocating, don't allocate. + if (self.kv_cache_manager.block_pool.get_num_free_blocks() - + num_evictable_computed_blocks - + num_new_blocks) < watermark_blocks: + return False + return True + + def _get_prompt_limit(self, request: Request) -> int: + if (self.scheduler_config.chunked_prefill_enabled + and not self.scheduler_config.is_multi_step): + prompt_limit = self.scheduler_config.max_model_len + else: + prompt_limit = min( + self.scheduler_config.max_model_len, + self.scheduler_config.max_num_batched_tokens, + ) + + # Model is fine tuned with long context. Return the fine tuned max_len. 
+ if request.lora_request and request.lora_request.long_lora_max_len: + assert prompt_limit <= request.lora_request.long_lora_max_len + return request.lora_request.long_lora_max_len + else: + return prompt_limit + + def finish_requests( + self, + request_ids: Union[str, Iterable[str]], + finished_status: RequestStatus, + ) -> None: + """Handles the finish signal from outside the scheduler. + + For example, the API server can abort a request when the client + disconnects. + """ + for req_id in request_ids: + request = self.requests.get(req_id) + if request is None: + # Invalid request ID. + continue + if request.status == RequestStatus.RUNNING: + self.scheduled_req_ids.discard(request.request_id) + super().finish_requests(request_ids, finished_status) + + def update_from_output( + self, + scheduler_output: SchedulerOutput, + model_runner_output: ModelRunnerOutput, + ) -> EngineCoreOutputs: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. + for request in self.running: + req_id = request.request_id + num_tokens_scheduled = num_scheduled_tokens.get(req_id, 0) + if num_tokens_scheduled == 0: + # The request was not scheduled in this step. + continue + if req_id in self.scheduled_req_ids: + self.scheduled_req_ids.remove(req_id) + + return super().update_from_output(scheduler_output, + model_runner_output) diff --git a/vllm_ascend/device_allocator/__init__.py b/vllm_ascend/device_allocator/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/device_allocator/camem.py b/vllm_ascend/device_allocator/camem.py new file mode 100644 index 0000000..1bd97ab --- /dev/null +++ b/vllm_ascend/device_allocator/camem.py @@ -0,0 +1,278 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# CANN-mem-based pytorch pluggable allocator to implement sleep mode.
+#
+import dataclasses
+import os
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+import torch
+from acl.rt import memcpy  # type: ignore # noqa: F401
+from vllm.logger import logger
+
+from vllm_ascend.platform import NPUPlatform
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of
+    a loaded library.
+    """  # noqa
+    found_line = None
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found_line = line
+                break
+    if found_line is None:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = found_line.index("/")
+    path = found_line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(lib_name), \
+        f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+camem_available = False
+try:
+    from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
+        init_module, python_create_and_map, python_unmap_and_release)
+    lib_name = find_loaded_library("vllm_ascend_C")
+    camem_available = True
+except ImportError as e:
+    logger.warning(
+        "Failed to import vllm_ascend_C:%s. Sleep mode will be disabled. ", e)
+    init_module = None
+    python_create_and_map = None
+    python_unmap_and_release = None
+    lib_name = None
+    libcudart = None  # NOTE(review): leftover from the CUDA version; nothing in this file reads it — consider removing
+
+# py_device, py_alignedSize, py_d_mem, py_p_memHandle
+HandleType = Tuple[int, int, int, int]
+
+
+@dataclasses.dataclass
+class AllocationData:
+    handle: HandleType
+    tag: str
+    cpu_backup_tensor: Optional[torch.Tensor] = None
+
+
+def create_and_map(allocation_handle: HandleType) -> None:
+    python_create_and_map(*allocation_handle)
+
+
+def unmap_and_release(allocation_handle: HandleType) -> None:
+    python_unmap_and_release(*allocation_handle)
+
+
+def get_pluggable_allocator(
+    python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+    python_free_func: Callable[[int], tuple[int, int, int, int]]
+) -> torch.npu.memory.NPUPluggableAllocator:
+    init_module(python_malloc_fn, python_free_func)
+    new_alloc = torch.npu.memory.NPUPluggableAllocator(lib_name, 'my_malloc',
+                                                       'my_free')
+    return new_alloc
+
+
+@contextmanager
+def use_memory_pool_with_allocator(
+        python_malloc_fn: Callable[[tuple[int, int, int, int]], None],
+        python_free_func: Callable[[int], tuple[int, int, int, int]]):
+    new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func)
+    mem_pool = torch.npu.memory.MemPool(new_alloc._allocator)
+    with torch.npu.memory.use_mem_pool(mem_pool):
+        yield mem_pool, new_alloc
+
+
+class CaMemAllocator:
+    """
+    A singleton class that manages a memory pool for CANN tensors.
+    The memory in this pool can be offloaded or discarded when the
+    allocator sleeps.
+    Inside the `use_memory_pool(tag)` context, all tensors created will
+    be allocated in the memory pool, and has the same tag as the
+    tag passed to the context.
+    When we call `sleep`, all tensors with the specified tag will be
+    offloaded to CPU memory, and the rest of the tensors will be discarded.
+    When we call `wake_up`, all tensors that are previously offloaded
+    will be loaded back to GPU memory, and the rest of the tensors will
+    have empty memory.
+    Why it needs to be a singleton?
+    When allocated tensors are garbage collected, PyTorch will call
+    the free callback, which will call the `python_free_callback` method.
+    The C-extension uses a global variable to store the function of an
+    instance of this class. If we create multiple instances of this class,
+    the global variable will be overwritten and the free callback will
+    not work as expected.
+    """
+    instance = None
+    default_tag: str = "default"
+
+    @staticmethod
+    def get_instance() -> "CaMemAllocator":
+        """
+        CaMemAllocator is a singleton class.
+        We cannot call the constructor directly.
+        Call this method to get the instance.
+        """
+        if CaMemAllocator.instance is None:
+            CaMemAllocator.instance = CaMemAllocator()
+        return CaMemAllocator.instance
+
+    def __init__(self):
+        conf = os.environ.get("PYTORCH_NPU_ALLOC_CONF", "")
+        assert "expandable_segments:True" not in conf, \
+            ("Expandable segments are not compatible with memory pool. "
+             "Please track https://github.com/pytorch/pytorch/issues/147851 "
+             "for the latest updates.")
+
+        self.pointer_to_data: Dict[int, AllocationData] = {}
+        self.current_tag: str = CaMemAllocator.default_tag
+        self.allocator_and_pools: Dict[str, Any] = {}
+
+    def python_malloc_callback(self, allocation_handle: HandleType) -> None:
+        """
+        Internal method to store the allocation data
+        when memory is allocated in the memory pool."""
+        py_d_mem = allocation_handle[2]
+        self.pointer_to_data[py_d_mem] = AllocationData(
+            allocation_handle, self.current_tag)
+        return
+
+    def python_free_callback(self, ptr: int) -> HandleType:
+        """
+        Internal method to look up the allocation data
+        when memory is freed in the memory pool."""
+        data = self.pointer_to_data.pop(ptr)
+        if data.cpu_backup_tensor is not None:
+            data.cpu_backup_tensor = None
+        return data.handle
+
+    def sleep(
+            self,
+            offload_tags: Optional[Union[Tuple[str, ...],
+                                         str]] = None) -> None:
+        """
+        Put the allocator in sleep mode.
+        All data in the memory allocation with the specified tag will be
+        offloaded to CPU memory, and others will be discarded.
+        :param offload_tags: The tags of the memory allocation that will be
+            offloaded. The rest of the memory allocation will be discarded.
+        """
+        if offload_tags is None:
+            # by default, allocated tensors are offloaded
+            # when the allocator sleeps
+            offload_tags = (CaMemAllocator.default_tag, )
+        elif isinstance(offload_tags, str):
+            offload_tags = (offload_tags, )
+
+        assert isinstance(offload_tags, tuple)
+
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            if data.tag in offload_tags:
+                size_in_bytes = handle[1]
+                cpu_backup_tensor = torch.empty(
+                    size_in_bytes,
+                    dtype=torch.uint8,
+                    device='cpu',
+                    pin_memory=NPUPlatform.is_pin_memory_available())
+                cpu_ptr = cpu_backup_tensor.data_ptr()
+                ACL_MEMCPY_DEVICE_TO_HOST = 2
+                dest_max = cpu_ptr + size_in_bytes * 2  # NOTE(review): acl.rt.memcpy's destMax is a byte count; pointer+2*size exceeds count so the copy passes, but plain size_in_bytes would be clearer — confirm against CANN docs
+                memcpy(cpu_ptr, dest_max, ptr, size_in_bytes,
+                       ACL_MEMCPY_DEVICE_TO_HOST)
+                data.cpu_backup_tensor = cpu_backup_tensor
+            unmap_and_release(handle)
+
+    def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        """
+        Wake up the allocator from sleep mode.
+        All data that is previously offloaded will be loaded back to GPU
+        memory, and the rest of the data will have empty memory."""
+        for ptr, data in self.pointer_to_data.items():
+            if tags is None or data.tag in tags:
+                handle = data.handle
+                create_and_map(handle)
+                if data.cpu_backup_tensor is not None:
+                    cpu_backup_tensor = data.cpu_backup_tensor
+                    if cpu_backup_tensor is not None:
+                        size_in_bytes = cpu_backup_tensor.numel(
+                        ) * cpu_backup_tensor.element_size()
+                        cpu_ptr = cpu_backup_tensor.data_ptr()
+                        ACL_MEMCPY_HOST_TO_DEVICE = 1
+                        dest_max = ptr + size_in_bytes * 2
+                        memcpy(ptr, dest_max, cpu_ptr, size_in_bytes,
+                               ACL_MEMCPY_HOST_TO_DEVICE)
+                        data.cpu_backup_tensor = None
+
+    @contextmanager
+    def use_memory_pool(self, tag: Optional[str] = None):
+        """
+        A context manager to use the memory pool.
+        All memory allocation created inside the context will be allocated
+        in the memory pool, and has the specified tag.
+        :param tag: The tag of the memory allocation. If None, the default tag
+            will be used.
+        """
+        if tag is None:
+            tag = CaMemAllocator.default_tag
+
+        assert isinstance(tag, str)
+
+        old_tag = self.current_tag
+        self.current_tag = tag
+        with use_memory_pool_with_allocator(self.python_malloc_callback,
+                                            self.python_free_callback) as data:
+            # start to hit another PyTorch bug in PyTorch 2.6,
+            # possibly because of gc-related issue w.r.t. the allocator and
+            # the memory pool.
+            # to avoid the issue, we keep a reference of the data.
+            # see https://github.com/pytorch/pytorch/issues/146431 .
+            self.allocator_and_pools[tag] = data
+            yield
+            # PyTorch's bug, calling torch.cuda.empty_cache() will error
+            # when using pluggable allocator, see
+            # https://github.com/pytorch/pytorch/issues/145168 .
+            # if we have some memory allocated and then freed,
+            # the memory will not be released.
+            # right now it is fine, because we only use this allocator
+            # during weight loading and kv cache creation, where we only
+            # allocate memory.
+            # TODO: we need to find a way to release the memory,
+            # i.e. calling torch.cuda.empty_cache()
+            self.current_tag = old_tag  # NOTE(review): not in a finally — if the with-body raises, current_tag stays as `tag`; confirm this matches upstream CuMemAllocator behavior
+
+    def get_current_usage(self) -> int:
+        """
+        Get the total number of bytes allocated in the memory pool.
+        """
+        sum_bytes: int = 0
+        for ptr, data in self.pointer_to_data.items():
+            handle = data.handle
+            sum_bytes += handle[1]
+        return sum_bytes
diff --git a/vllm_ascend/distributed/__init__.py b/vllm_ascend/distributed/__init__.py
new file mode 100644
index 0000000..458b814
--- /dev/null
+++ b/vllm_ascend/distributed/__init__.py
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm.distributed.kv_transfer.kv_connector.factory import \ + KVConnectorFactory + +KVConnectorFactory.register_connector( + "LLMDataDistCMgrConnector", + "vllm_ascend.distributed.llmdatadist_c_mgr_connector", + "LLMDataDistCMgrConnector") + +KVConnectorFactory.register_connector( + "MooncakeConnectorV1", "vllm_ascend.distributed.mooncake_connector", + "MooncakeConnector") diff --git a/vllm_ascend/distributed/communication_op.py b/vllm_ascend/distributed/communication_op.py new file mode 100644 index 0000000..2e475f5 --- /dev/null +++ b/vllm_ascend/distributed/communication_op.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +import torch +from vllm.distributed.parallel_state import get_dp_group + + +def data_parallel_reduce_scatter(input_: torch.Tensor, + dim: int = -1) -> torch.Tensor: + """Reduce-Scatter the input tensor across data parallel group.""" + return get_dp_group().reduce_scatter(input_, dim) diff --git a/vllm_ascend/distributed/communicator.py b/vllm_ascend/distributed/communicator.py new file mode 100644 index 0000000..7c14bef --- /dev/null +++ b/vllm_ascend/distributed/communicator.py @@ -0,0 +1,75 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# +from typing import List, Optional + +import torch +import torch.distributed as dist +from vllm.distributed.device_communicators.base_device_communicator import \ + DeviceCommunicatorBase + + +class NPUCommunicator(DeviceCommunicatorBase): + + def __init__(self, + cpu_group: dist.ProcessGroup, + device: Optional[torch.device] = None, + device_group: Optional[dist.ProcessGroup] = None, + unique_name: str = ""): + super().__init__(cpu_group, device, device_group, unique_name) + # TODO(hz): Refer to CudaCommunicator's implementation to integrate PyHcclCommunicator + # init device according to rank + self.device = torch.npu.current_device() + + def all_to_all(self, + input_: torch.Tensor, + scatter_dim: int = 0, + gather_dim: int = -1, + scatter_sizes: Optional[List[int]] = None, + gather_sizes: Optional[List[int]] = None) -> torch.Tensor: + + if scatter_dim < 0: + scatter_dim += input_.dim() + if gather_dim < 0: + gather_dim += input_.dim() + + if scatter_sizes is not None and gather_sizes is not None: + input_list = [ + t.contiguous() + for t in torch.split(input_, scatter_sizes, scatter_dim) + ] + output_list = [] + tensor_shape_base = input_list[self.rank].size() + for i in range(self.world_size): + tensor_shape = list(tensor_shape_base) + tensor_shape[gather_dim] = gather_sizes[i] + output_list.append( + torch.empty(tensor_shape, + dtype=input_.dtype, + device=input_.device)) + + else: + input_list = [ + t.contiguous() for t in torch.tensor_split( + input_, self.world_size, scatter_dim) + ] + output_list = [ + torch.empty_like(input_list[i]) for i in range(self.world_size) + ] + + dist.all_to_all(output_list, input_list, group=self.device_group) + output_tensor = torch.cat(output_list, dim=gather_dim).contiguous() + return output_tensor diff --git a/vllm_ascend/distributed/device_communicators/__init__.py b/vllm_ascend/distributed/device_communicators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/vllm_ascend/distributed/device_communicators/pyhccl.py b/vllm_ascend/distributed/device_communicators/pyhccl.py new file mode 100644 index 0000000..984ece7 --- /dev/null +++ b/vllm_ascend/distributed/device_communicators/pyhccl.py @@ -0,0 +1,165 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional, Union + +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup, ReduceOp +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import logger + +from vllm_ascend.distributed.device_communicators.pyhccl_wrapper import ( + HCCLLibrary, aclrtStream_t, buffer_type, hcclComm_t, hcclDataTypeEnum, + hcclRedOpTypeEnum, hcclUniqueId) +from vllm_ascend.utils import current_stream + + +class PyHcclCommunicator: + + def __init__( + self, + group: Union[ProcessGroup, StatelessProcessGroup], + device: Union[int, str, torch.device], + library_path: Optional[str] = None, + ): + """ + Args: + group: the process group to work on. If None, it will use the + default process group. + device: the device to bind the PyHcclCommunicator to. If None, + it will be bind to f"npu:{local_rank}". + library_path: the path to the HCCL library. If None, it will + use the default library path. + It is the caller's responsibility to make sure each communicator + is bind to a unique device. 
+ """ + + if not isinstance(group, StatelessProcessGroup): + assert dist.is_initialized() + assert dist.get_backend(group) != dist.Backend.HCCL, ( + "PyHcclCommunicator should be attached to a non-HCCL group.") + # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + + self.group = group + + # if world_size == 1, no need to create communicator + if self.world_size == 1: + self.available = False + self.disabled = True + return + + try: + self.hccl = HCCLLibrary(library_path) + except Exception: + # disable because of missing HCCL library + # e.g. in a non-NPU environment + self.available = False + self.disabled = True + return + + self.available = True + self.disabled = False + + logger.info("vLLM is using pyhccl") + + if isinstance(device, int): + device = torch.device(f"npu:{device}") + elif isinstance(device, str): + device = torch.device(device) + # now `device` is a `torch.device` object + assert isinstance(device, torch.device) + self.device = device + + if self.rank == 0: + # get the unique id from HCCL + with torch.npu.device(device): + self.unique_id = self.hccl.hcclGetUniqueId() + else: + # construct an empty unique id + self.unique_id = hcclUniqueId() + + if not isinstance(group, StatelessProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) + + # hccl communicator and stream will use this device + # `torch.npu.device` is a context manager that changes the + # current npu device to the specified one + with torch.npu.device(device): + self.comm: hcclComm_t = 
self.hccl.hcclCommInitRank( + self.world_size, self.unique_id, self.rank) + + stream = current_stream() + # A small all_reduce for warmup. + data = torch.zeros(1, device=device) + self.all_reduce(data) + stream.synchronize() + del data + + def all_reduce(self, + in_tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None) -> torch.Tensor: + if self.disabled: + return None + # hccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert in_tensor.device == self.device, ( + f"this hccl communicator is created to work on {self.device}, " + f"but the input tensor is on {in_tensor.device}") + + out_tensor = torch.empty_like(in_tensor) + + if stream is None: + stream = current_stream() + self.hccl.hcclAllReduce(buffer_type(in_tensor.data_ptr()), + buffer_type(out_tensor.data_ptr()), + in_tensor.numel(), + hcclDataTypeEnum.from_torch(in_tensor.dtype), + hcclRedOpTypeEnum.from_torch(op), self.comm, + aclrtStream_t(stream.npu_stream)) + return out_tensor + + def broadcast(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this hccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = current_stream() + if src == self.rank: + buffer = buffer_type(tensor.data_ptr()) + else: + buffer = buffer_type(tensor.data_ptr()) + self.hccl.hcclBroadcast(buffer, tensor.numel(), + hcclDataTypeEnum.from_torch(tensor.dtype), src, + self.comm, aclrtStream_t(stream.npu_stream)) diff --git a/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py b/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py new file mode 100644 index 0000000..3435cc2 --- /dev/null +++ b/vllm_ascend/distributed/device_communicators/pyhccl_wrapper.py @@ -0,0 +1,253 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import platform +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import torch +from torch.distributed import ReduceOp +from vllm.logger import logger + +from vllm_ascend.utils import find_hccl_library + +# export types and functions from hccl to Python === +# for the original hccl definition, please check +# https://github.com/EternalLied/cann-hccl-new/blob/64ec6ce2923319caa5df8c3c531e06bdc148ce9c/inc/hccl/hccl.h#L90 +# https://github.com/EternalLied/cann-hccl-new/blob/64ec6ce2923319caa5df8c3c531e06bdc148ce9c/inc/hccl/hccl_types.h#L48 + +hcclResult_t = ctypes.c_int +hcclComm_t = ctypes.c_void_p + + +class hcclUniqueId(ctypes.Structure): + _fields_ = [("internal", ctypes.c_byte * 4108)] + + +aclrtStream_t = ctypes.c_void_p +buffer_type = ctypes.c_void_p + +hcclDataType_t = ctypes.c_int + + +class hcclDataTypeEnum: + hcclInt8 = 0 + hcclInt16 = 1 + hcclInt32 = 2 + hcclFloat16 = 3 + hcclFloat32 = 4 + hcclInt64 = 5 + hcclUint64 = 6 + hcclUint8 = 7 + hcclUint16 = 8 + hcclUint32 = 9 + hcclFloat64 = 10 + hcclBfloat16 = 11 + hcclInt128 = 12 + + @classmethod + def from_torch(cls, dtype: torch.dtype) -> int: + if dtype == torch.int8: + return cls.hcclInt8 + if dtype == torch.uint8: + return cls.hcclUint8 + if dtype == torch.int32: + return cls.hcclInt32 + if dtype == torch.int64: + return cls.hcclInt64 + if dtype == 
torch.float16: + return cls.hcclFloat16 + if dtype == torch.float32: + return cls.hcclFloat32 + if dtype == torch.float64: + return cls.hcclFloat64 + if dtype == torch.bfloat16: + return cls.hcclBfloat16 + raise ValueError(f"Unsupported dtype: {dtype}") + + +hcclRedOp_t = ctypes.c_int + + +class hcclRedOpTypeEnum: + hcclSum = 0 + hcclProd = 1 + hcclMax = 2 + hcclMin = 3 + + @classmethod + def from_torch(cls, op: ReduceOp) -> int: + if op == ReduceOp.SUM: + return cls.hcclSum + if op == ReduceOp.PRODUCT: + return cls.hcclProd + if op == ReduceOp.MAX: + return cls.hcclMax + if op == ReduceOp.MIN: + return cls.hcclMin + raise ValueError(f"Unsupported op: {op}") + + +@dataclass +class Function: + name: str + restype: Any + argtypes: List[Any] + + +class HCCLLibrary: + exported_functions = [ + # const char* HcclGetErrorString(HcclResult code); + Function("HcclGetErrorString", ctypes.c_char_p, [hcclResult_t]), + + # HcclResult HcclGetRootInfo(HcclRootInfo *rootInfo); + Function("HcclGetRootInfo", hcclResult_t, + [ctypes.POINTER(hcclUniqueId)]), + + # HcclResult HcclCommInitRootInfo( + # uint32_t nRanks, const HcclRootInfo *rootInfo, uint32_t rank, HcclComm *comm); + # note that HcclComm is a pointer type, so the last argument is a pointer to a pointer + Function("HcclCommInitRootInfo", hcclResult_t, [ + ctypes.c_int, + ctypes.POINTER(hcclUniqueId), + ctypes.c_int, + ctypes.POINTER(hcclComm_t), + ]), + + # HcclResult HcclAllReduce( + # void *sendBuf, void *recvBuf, uint64_t count, + # HcclDataType dataType, HcclReduceOp op, HcclComm comm, + # aclrtStream stream); + Function("HcclAllReduce", hcclResult_t, [ + buffer_type, + buffer_type, + ctypes.c_size_t, + hcclDataType_t, + hcclRedOp_t, + hcclComm_t, + aclrtStream_t, + ]), + + # HcclResult HcclBroadcast( + # void *buf, uint64_t count, + # HcclDataType dataType, uint32_t root, + # HcclComm comm, aclrtStream stream); + Function("HcclBroadcast", hcclResult_t, [ + buffer_type, + ctypes.c_size_t, + hcclDataType_t, + 
ctypes.c_int, + hcclComm_t, + aclrtStream_t, + ]), + + # HcclResult HcclCommDestroy(HcclComm comm); + Function("HcclCommDestroy", hcclResult_t, [hcclComm_t]), + ] + + # class attribute to store the mapping from the path to the library + # to avoid loading the same library multiple times + path_to_library_cache: Dict[str, Any] = {} + + # class attribute to store the mapping from library path + # to the correspongding directory + path_to_dict_mapping: Dict[str, Dict[str, Any]] = {} + + def __init__(self, so_file: Optional[str] = None): + + so_file = so_file or find_hccl_library() + + try: + if so_file not in HCCLLibrary.path_to_dict_mapping: + lib = ctypes.CDLL(so_file) + HCCLLibrary.path_to_library_cache[so_file] = lib + self.lib = HCCLLibrary.path_to_library_cache[so_file] + except Exception as e: + logger.error( + "Failed to load HCCL library from %s. " + "It is expected if you are not running on Ascend NPUs." + "Otherwise, the hccl library might not exist, be corrupted " + "or it does not support the current platform %s. 
" + "If you already have the library, please set the " + "environment variable HCCL_SO_PATH" + " to point to the correct hccl library path.", so_file, + platform.platform()) + raise e + + if so_file not in HCCLLibrary.path_to_dict_mapping: + _funcs: Dict[str, Any] = {} + for func in HCCLLibrary.exported_functions: + f = getattr(self.lib, func.name) + f.restype = func.restype + f.argtypes = func.argtypes + _funcs[func.name] = f + HCCLLibrary.path_to_dict_mapping[so_file] = _funcs + self._funcs = HCCLLibrary.path_to_dict_mapping[so_file] + + def hcclGetErrorString(self, result: hcclResult_t) -> str: + return self._funcs["HcclGetErrorString"](result).decode("utf-8") + + def HCCL_CHECK(self, result: hcclResult_t) -> None: + if result != 0: + error_str = self.hcclGetErrorString(result) + raise RuntimeError(f"HCCL error: {error_str}") + + def hcclGetUniqueId(self) -> hcclUniqueId: + unique_id = hcclUniqueId() + self.HCCL_CHECK(self._funcs["HcclGetRootInfo"]( + ctypes.byref(unique_id))) + return unique_id + + def hcclCommInitRank(self, world_size: int, unique_id: hcclUniqueId, + rank: int) -> hcclComm_t: + comm = hcclComm_t() + self.HCCL_CHECK(self._funcs["HcclCommInitRootInfo"]( + world_size, ctypes.byref(unique_id), rank, ctypes.byref(comm))) + return comm + + def hcclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type, + count: int, datatype: int, op: int, comm: hcclComm_t, + stream: aclrtStream_t) -> None: + # `datatype` actually should be `hcclDataType_t` + # and `op` should be `hcclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.HCCL_CHECK(self._funcs["HcclAllReduce"](sendbuff, recvbuff, count, + datatype, op, comm, + stream)) + + def hcclBroadcast(self, buf: buffer_type, count: int, datatype: int, + root: int, comm: hcclComm_t, + stream: aclrtStream_t) -> None: + self.HCCL_CHECK(self._funcs["HcclBroadcast"](buf, count, datatype, + root, comm, 
stream)) + + def hcclCommDestroy(self, comm: hcclComm_t) -> None: + self.HCCL_CHECK(self._funcs["HcclCommDestroy"](comm)) + + +__all__ = [ + "HCCLLibrary", + "hcclDataTypeEnum", + "hcclRedOpTypeEnum", + "hcclUniqueId", + "hcclComm_t", + "aclrtStream_t", + "buffer_type", +] diff --git a/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py new file mode 100644 index 0000000..fe6617a --- /dev/null +++ b/vllm_ascend/distributed/llmdatadist_c_mgr_connector.py @@ -0,0 +1,894 @@ +import contextlib +import json +import math +import os +import threading +import time +from collections import defaultdict +from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from enum import Enum +from typing import Any, Callable, Optional, Tuple + +import llm_datadist # type: ignore +import msgspec +import torch +import zmq +from llm_datadist import (BlocksCacheKey, CacheDesc, LLMConfig, LLMDataDist, + LLMException, LLMRole) +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.parallel_state import get_tp_group, get_world_group +from vllm.forward_context import ForwardContext +from vllm.utils import get_ip, logger +from vllm.v1.core.kv_cache_manager import KVCacheBlocks +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import Request, RequestStatus + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version + +TORCH_DTYPE_TO_NPU_DTYPE = { + torch.half: llm_datadist.DataType.DT_FLOAT16, + torch.float16: llm_datadist.DataType.DT_FLOAT16, + torch.bfloat16: llm_datadist.DataType.DT_BF16, + torch.float: llm_datadist.DataType.DT_FLOAT, + torch.float32: llm_datadist.DataType.DT_FLOAT, + torch.int8: llm_datadist.DataType.DT_INT8, + torch.int64: 
llm_datadist.DataType.DT_INT64, + torch.int32: llm_datadist.DataType.DT_INT32 +} + + +class LLMDataDistCMgrEvent(Enum): + ReqForMetadata = 0 + ReqForFinished = 1 + + +class LLMDataDistCMgrAgentMetadata(msgspec.Struct): + super_pod_id: str + server_id: str + device_id: str + device_ip: str + super_device_id: str + cluster_id: int + + +@dataclass +class ReqMeta: + local_block_ids: list[int] + remote_block_ids: list[int] + remote_host: str + remote_port: str + engine_id: str + remote_tp_size: str + + +class LLMDataDistCMgrConnectorMetadata(KVConnectorMetadata): + + def __init__(self): + self.requests: dict[str, ReqMeta] = {} + + def add_new_req(self, request_id: str, local_block_ids: list[int], + kv_transfer_params: dict[str, Any]): + self.requests[request_id] = ReqMeta( + local_block_ids=local_block_ids, + remote_block_ids=kv_transfer_params["remote_block_ids"], + engine_id=kv_transfer_params["remote_engine_id"], + remote_host=kv_transfer_params["remote_host"], + remote_port=kv_transfer_params["remote_port"], + remote_tp_size=kv_transfer_params["remote_tp_size"], + ) + + +class LLMDataDistCMgrConnector(KVConnectorBase_V1): + + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + assert vllm_config.kv_transfer_config is not None + self.engine_id = vllm_config.kv_transfer_config.engine_id + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler: Optional[ + LLMDataDistCMgrConnectorScheduler] = LLMDataDistCMgrConnectorScheduler( + vllm_config, self.engine_id) + elif role == KVConnectorRole.WORKER: + self.connector_scheduler = None + self.connector_worker = LLMDataDistCMgrConnectorWorker(vllm_config) + + ############################################################ + # Scheduler Side Methods + ############################################################ + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + assert self.connector_scheduler is not None + return 
self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens) + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + ############################################################ + # Worker Side Methods + ############################################################ + def register_kv_caches( + self, + kv_caches: dict[ + str, # type: ignore[override] + Tuple[torch.Tensor]]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """Get the finished recving and sending requests.""" + assert self.connector_worker is not None + return self.connector_worker.get_finished(finished_req_ids) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, + LLMDataDistCMgrConnectorMetadata) + self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + """LLMDataDistCMgrConnector does not do layerwise saving, the load is in blocking manager.""" + pass + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata, **kwargs) -> None: + 
"""LLMDataDistCMgrConnector does not save explicitly.""" + pass + + def wait_for_save(self): + """LLMDataDistCMgrConnector does not save explicitly.""" + pass + + +class LLMDataDistCMgrConnectorScheduler(): + + def __init__(self, vllm_config: VllmConfig, engine_id: Optional[str]): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + self.engine_id = engine_id + self.local_ip = get_ip() + # Can not retrieve the parallel config since it is not initialized. + self.local_dp_rank = None + self.tp_size = None + dp_rank_local = self.vllm_config.parallel_config.data_parallel_rank_local + tp_size = self.vllm_config.parallel_config.tensor_parallel_size + + self.port = dp_rank_local * tp_size + envs_ascend.VLLM_ASCEND_LLMDD_RPC_PORT if dp_rank_local is not None else tp_size + envs_ascend.VLLM_ASCEND_LLMDD_RPC_PORT + + self._reqs_need_recv: dict[str, tuple[Request, list[int]]] = {} + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + """ + For remote prefill, pull all prompt blocks from remote + asynchronously relative to engine execution. + + Args: + request (Request): the request object. + num_computed_tokens (int): the number of locally + computed tokens for this request + Returns: + * the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + * true if the external KV cache tokens will be loaded + asynchronously (between scheduler steps). + """ + + params = request.kv_transfer_params + logger.debug( + f"LLMDataDistCMgrConnector get_num_new_matched_tokens: num_computed_tokens={num_computed_tokens}, kv_transfer_params={params}" + ) + + if params is not None and params.get("do_remote_prefill"): + # Remote prefill: get all prompt blocks from remote. + assert num_computed_tokens % self.block_size == 0 + # Note: We use the full token count as transmit data here. 
+ count = max(len(request.prompt_token_ids) - num_computed_tokens, 0) + return count, count > 0 + + # No remote prefill for this request. + return 0, False + + def update_state_after_alloc(self, request: Request, blocks: KVCacheBlocks, + num_externel_tokens: int): + params = request.kv_transfer_params + logger.debug( + f"LLMDataDistCMgrConnector update states num_externel_tokens: {num_externel_tokens} kv_transfer_params: {params}" + ) + if params is not None and params.get("do_remote_prefill"): + if params.get("remote_block_ids"): + if all(p in params for p in ("remote_engine_id", "remote_host", + "remote_port", "remote_tp_size")): + self._reqs_need_recv[request.request_id] = ( + request, blocks.get_unhashed_block_ids()) + else: + logger.warning("" \ + f"Invalid KVTransferParams {params}, This request will be discard") + else: + assert num_externel_tokens == 0 + params["do_remote_prefill"] = False + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + meta = LLMDataDistCMgrConnectorMetadata() + + for req_id, (req, block_ids) in self._reqs_need_recv.items(): + assert req.kv_transfer_params is not None + meta.add_new_req(request_id=req_id, + local_block_ids=block_ids, + kv_transfer_params=req.kv_transfer_params) + self._reqs_need_recv.clear() + + return meta + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + + params = request.kv_transfer_params + logger.debug( + "LLMDataDistCMgrConnector request_finished, request_status=%s, " + "kv_transfer_params=%s", request.status, params) + + if (params is None or not params.get("do_remote_decode") + or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): + return False, None + + # note: NIXL transfer the full block only, but I don't see any reason to do that, so here + # we just transfer any data that computed from prefill node + # note: there might be some issue on this, check it if there is any 
unexpected result + computed_block_ids = block_ids + delay_free_blocks = len(computed_block_ids) > 0 + if delay_free_blocks: + logger.info("Delaying free of %d blocks for request %s", + len(computed_block_ids), request.request_id) + return delay_free_blocks, dict( + do_remote_prefill=True, + do_remote_decode=False, + remote_block_ids=computed_block_ids, + remote_engine_id=self.engine_id, + remote_host=self.local_ip, + remote_port=self.port, + remote_tp_size=str( + self.vllm_config.parallel_config.tensor_parallel_size), + ) + + +class LLMDataDistCMgrConnectorWorker(): + """ + Implementation of Worker side methods + """ + + def __init__(self, vllm_config: VllmConfig): + assert vllm_config.kv_transfer_config is not None + logger.info("Initialize the LLMDataDistCMgrConnectorWorker") + # we assume the local node only contains dp and tp, and tp will not communicate inter-node. + # for any scenario beyond this scope, the functionality of this connector is not guaranteed. + self.local_rank_on_node = get_world_group().rank % ( + vllm_config.parallel_config.data_parallel_size_local * + vllm_config.parallel_config.tensor_parallel_size) + self.local_rank = get_world_group().local_rank + self.local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + self.tp_size = vllm_config.parallel_config.tensor_parallel_size + self.tp_rank = get_tp_group().rank_in_group + self.rank = get_world_group().rank + self.local_ip = get_ip() + self.kv_transfer_config: KVTransferConfig = vllm_config.kv_transfer_config + self.local_agent_metadata: Optional[ + LLMDataDistCMgrAgentMetadata] = None + self.vllm_config = vllm_config + self.executor = ThreadPoolExecutor(1) + self.thread_lock = threading.Lock() + + self.llm_datadist_role = None + self.llm_datadist_remote_role = None + if self.kv_transfer_config.kv_role == "kv_producer": + self.llm_datadist_role = LLMRole.PROMPT + self.llm_datadist_remote_role = LLMRole.DECODER + elif self.kv_transfer_config.kv_role == "kv_consumer": + 
            self.llm_datadist_role = LLMRole.DECODER
            self.llm_datadist_remote_role = LLMRole.PROMPT
        else:
            raise RuntimeError(
                f"LLMDataDistWorker: Receive unexpected kv role in LLMDataDistWorker, this worker now only support kv_producer and kv_consumer, but receiving {vllm_config.kv_transfer_config.kv_role}"
            )

        # linked_cluster record the cluster that already build the connection its format should be {"cluster_id": "comm_name"}
        self.linked_cluster: dict[Any, Any] = {}
        self.prefill_device_list: list[tuple[int, int]] = []
        self.decode_device_list: list[tuple[int, int]] = []
        global_rank_table = self.read_offline_rank_table()
        self.local_agent_metadata = self.read_agent_metadata(global_rank_table)
        self.llm_datadist = LLMDataDist(self.llm_datadist_role,
                                        self.local_agent_metadata.cluster_id)
        self.init_llm_datadist()
        # Request ids whose transfer completed; drained by get_finished().
        self.finished_reqs: set[str] = set()
        self.soc_info = get_ascend_soc_version()
        # Set hccl deterministic for model execute
        os.environ["HCCL_DETERMINISTIC"] = "true"
        # Per-request set of decode tp ranks that reported completion.
        self.done_receiving_counts: defaultdict[str,
                                                set[int]] = defaultdict(set)

    def listen_for_agent_metadata_req(self, event: threading.Event):
        """Background ZMQ ROUTER loop serving peer requests.

        Answers two event kinds: metadata queries (replies with this agent's
        encoded metadata and links the requesting cluster) and
        transfer-finished notifications.  ``event`` is set once the socket is
        bound so the caller can wait for readiness.  Runs until the process
        exits (daemon thread).
        """
        assert self.local_agent_metadata is not None
        # Offset the base RPC port by this worker's (dp, tp) position so each
        # worker listens on a distinct port.
        port = envs_ascend.VLLM_ASCEND_LLMDD_RPC_PORT + self.local_dp_rank * self.tp_size + self.tp_rank if self.local_dp_rank is not None else envs_ascend.VLLM_ASCEND_LLMDD_RPC_PORT + self.tp_size + self.tp_rank
        url = f"tcp://{envs_ascend.VLLM_ASCEND_LLMDD_RPC_IP}:{port}"
        msg_encoder = msgspec.msgpack.Encoder()
        msg_decoder = msgspec.msgpack.Decoder()
        msg_to_send = msg_encoder.encode(self.local_agent_metadata)
        logger.debug(f"Start to listen to address: {url}")
        logger.debug(
            f"The local agent metadata have {len(msg_to_send)} bytes here")
        logger.info(
            f"LLMDataDistCMgrConnectorWorker: Cluster {self.local_agent_metadata.cluster_id} start to listen request from peers"
        )
        with zmq_ctx(zmq.ROUTER, url) as sock:  # type: ignore[attr-defined]
            event.set()
            while True:
                identity, _, msg = sock.recv_multipart()
                # Each message is a msgpack-encoded [event, payload] pair.
                event_msg, decode_msg = msg_decoder.decode(msg)
                event_msg = LLMDataDistCMgrEvent(event_msg)
                if event_msg == LLMDataDistCMgrEvent.ReqForMetadata:
                    if "cluster_id" in decode_msg:
                        decode_msg = LLMDataDistCMgrAgentMetadata(**decode_msg)
                        logger.info(
                            f"LLMDataDistCMgrConnectorWorker: Receive message from cluster {decode_msg.cluster_id}"
                        )
                        # Reply with our own metadata, then link back to the
                        # requesting cluster.
                        sock.send_multipart((identity, b"", msg_to_send))
                        self.add_remote_agent(decode_msg)
                    else:
                        logger.warning(
                            f"LLMDataDistCMgrConnectorWorker: receiving unrecognized data {decode_msg}"
                        )
                elif event_msg == LLMDataDistCMgrEvent.ReqForFinished:
                    finished_req_id = decode_msg[0]
                    decode_tp_rank = decode_msg[1]
                    decode_tp_size = decode_msg[2]
                    with self.thread_lock:
                        # Only mark the request finished once every decode tp
                        # rank has reported in.
                        if self._increment_task_count(finished_req_id,
                                                      decode_tp_rank,
                                                      decode_tp_size):
                            logger.debug(
                                f"LLMDataDistCMgrConnectorWorker: Receiving request {finished_req_id} finished"
                            )
                            self.finished_reqs.add(finished_req_id)
                    sock.send_multipart(
                        (identity, b"", b"receiving decode finished"))
                else:
                    raise RuntimeError(
                        f"LLMDataDistCMgrConnectorWorker: Receiving unexpected request event {event_msg} from remote !"
                    )

    def _increment_task_count(self, request_id: str, tp_rank: int,
                              decode_tp_size: int):
        """Record a done-signal from one decode tp rank.

        Returns True exactly once per request: when all ``decode_tp_size``
        ranks have reported, at which point the tracking entry is dropped.
        Duplicate signals from the same rank are ignored.
        Caller must hold ``self.thread_lock``.
        """
        if request_id not in self.done_receiving_counts:
            self.done_receiving_counts[request_id] = set()
        if tp_rank in self.done_receiving_counts[request_id]:
            logger.warning(
                f"Received duplicate done signal for request {request_id} "
                f"from tp rank {tp_rank}. Ignoring.")
            return False
        self.done_receiving_counts[request_id].add(tp_rank)
        if len(self.done_receiving_counts[request_id]) == decode_tp_size:
            self.done_receiving_counts.pop(request_id)
            logger.info("All transfers completed for request: "
                        f"{request_id}. Total ranks: "
                        f"{decode_tp_size}.")
            return True
        return False

    def init_llm_datadist(self):
        """Initialize the llm_datadist engine and grab its cache manager."""
        assert self.local_agent_metadata is not None
        llm_config = LLMConfig()
        llm_config.device_id = self.local_rank
        # KV sync timeout in milliseconds (20s).
        llm_config.sync_kv_timeout = 20000
        llm_config.enable_switch_role = True
        llm_config.enable_cache_manager = True
        llm_config.enable_remote_cache_accessible = True
        llm_config_options = llm_config.generate_options()
        self.llm_datadist.init(llm_config_options)
        self.cache_manager = self.llm_datadist.cache_manager
        logger.info(
            f"Done initialize llm_datadist in rank {self.rank}, local rank {self.local_rank}, cluster id {self.local_agent_metadata.cluster_id}"
        )

    def read_offline_rank_table(self):
        """Load the disaggregated-prefill rank table from the path given by
        the DISAGGREGATED_PREFILL_RANK_TABLE_PATH env variable.

        Populates ``self.decode_device_list`` and ``self.prefill_device_list``
        with (server_id, device_id) pairs and returns the parsed JSON table.
        """
        assert (
            envs_ascend.DISAGGREGATED_PREFILL_RANK_TABLE_PATH
        ), "Please set path of rank_table to env variable DISAGGREGATED_PREFILL_RANK_TABLE_PATH"
        rank_table_path = envs_ascend.DISAGGREGATED_PREFILL_RANK_TABLE_PATH
        with open(rank_table_path, "r", encoding="utf-8") as f:
            global_rank_table = json.load(f)
        decode_device_list = global_rank_table["decode_device_list"]
        for decode_device in decode_device_list:
            server_id = decode_device["server_id"]
            device_id = decode_device["device_id"]
            self.decode_device_list.append((server_id, device_id))
        prefill_device_list = global_rank_table["prefill_device_list"]
        for prefill_device in prefill_device_list:
            server_id = prefill_device["server_id"]
            device_id = prefill_device["device_id"]
            self.prefill_device_list.append((server_id, device_id))

        # global_rank_table = json.dumps(global_rank_table)
        return global_rank_table

    @staticmethod
    def _get_visible_devices() -> Callable[[str], bool]:
        """
        Return a test function that check if the given device ID is visible.
        i.e. ASCEND_RT_VISIBLE_DEVICES is not set or contains the device_id.
+ """ + visible_devices = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "") + if not visible_devices: + return lambda device_id: True + visible_device_list = visible_devices.split(",") + return lambda device_id: device_id in visible_device_list + + def read_agent_metadata(self, global_rank_table): + device_filter = LLMDataDistCMgrConnectorWorker._get_visible_devices() + devices_type_list = [] + agent_metadata = None + if self.llm_datadist_role == LLMRole.PROMPT: + devices_type_list.append("prefill_device_list") + elif self.llm_datadist_role == LLMRole.DECODER: + devices_type_list.append("decode_device_list") + else: + devices_type_list.append("prefill_device_list") + devices_type_list.append("decode_device_list") + for device_type in devices_type_list: + device_list = global_rank_table[device_type] + device_list = [ + d for d in device_list if d.get("server_id") == self.local_ip + and device_filter(d.get("device_id", "")) + ] + if len(device_list) <= self.tp_rank: + continue + device_info = device_list[self.tp_rank] + super_pod_id_ = device_info.get("super_pod_id", None) + server_id_ = device_info["server_id"] + device_id_ = device_info["device_id"] + device_ip_ = device_info["device_ip"] + super_device_id_ = device_info.get("super_device_id", None) + cluster_id_ = int(device_info["cluster_id"]) + agent_metadata = LLMDataDistCMgrAgentMetadata( + super_pod_id=super_pod_id_, + server_id=server_id_, + device_id=device_id_, + device_ip=device_ip_, + super_device_id=super_device_id_, + cluster_id=cluster_id_, + ) + assert agent_metadata is not None, f"Can't read the target server_id {self.local_ip} and device_rank {self.rank} from rank table" + return agent_metadata + + def register_kv_caches(self, kv_caches: dict[str, Tuple[torch.Tensor]]): + _, first_kv_cache_tuple = next(iter(kv_caches.items())) + first_kv_cache = first_kv_cache_tuple[0] + assert len(first_kv_cache_tuple) > 1 + assert self.local_agent_metadata is not None + kv_cache_dtype = first_kv_cache.dtype + 
self.use_mla: bool = first_kv_cache_tuple[0].size( + -1) != first_kv_cache_tuple[1].size(-1) + # MLA case. [2 (k_normed, k_pe), num_blocks, ...] + # MHA case. [2 (k and v), num_blocks, ...] + self.num_blocks = first_kv_cache.shape[0] + block_rank = 3 # [block_size, latent_dim] + block_shape = first_kv_cache.shape[-block_rank:] + + self.block_len = math.prod(block_shape) + self.cache_addr: list[int] = [] + alignment = 2 * 1024 * 1024 + if self.use_mla: + cache_k_normed_addr_list = [] + cache_k_pe_addr_list = [] + k_normed = None + k_pe = None + for cache_or_caches in kv_caches.values(): + assert len(cache_or_caches) > 1 + k_normed, k_pe = cache_or_caches[0], cache_or_caches[1] + cache_k_normed_addr_list.append(k_normed.data_ptr()) + cache_k_pe_addr_list.append(k_pe.data_ptr()) + self.cache_addr = (cache_k_normed_addr_list, cache_k_pe_addr_list) + + cache_desc_k_normed = CacheDesc( + len(self.cache_addr[0]), [*k_normed.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + cache_desc_k_pe = CacheDesc( + len(self.cache_addr[1]), [*k_pe.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + cache_key_k_normed = BlocksCacheKey(cluster_id=int( + self.local_agent_metadata.cluster_id), + model_id=0) + cache_key_k_pe = BlocksCacheKey(cluster_id=int( + self.local_agent_metadata.cluster_id), + model_id=1) + self.cache_desc = (cache_desc_k_normed, cache_desc_k_pe) + self.cache_key = (cache_key_k_normed, cache_key_k_pe) + try: + cache_k_normed = self.cache_manager.register_blocks_cache( + self.cache_desc[0], self.cache_addr[0], self.cache_key[0]) + cache_k_pe = self.cache_manager.register_blocks_cache( + self.cache_desc[1], self.cache_addr[1], self.cache_key[1]) + self.cache = (cache_k_normed, cache_k_pe) + logger.info("LLMDataDistWorker: End of register Paged Cache.") + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to register_block_cache, receiving [cache_desc: {self.cache_desc}, cache_addr: 
{self.cache_addr}, cache_key: {self.cache_key}]" + ) + else: + for cache_or_caches in kv_caches.values(): + for cache in cache_or_caches: + base_addr = cache.data_ptr() + assert base_addr % alignment == 0, "The address of the registered kv cache should be aligned to 2M" + self.cache_addr.append(base_addr) + # register paged kv cache into the llm_cache manager + self.cache_desc = CacheDesc( + len(self.cache_addr), [*cache.shape], + TORCH_DTYPE_TO_NPU_DTYPE[kv_cache_dtype]) + self.cache_key = BlocksCacheKey( + cluster_id=int(self.local_agent_metadata.cluster_id)) + logger.info( + f"num of cache: {len(self.cache_addr)}, size of cache: {[*cache.shape]}, real size of cache: {first_kv_cache.shape}" + ) + try: + self.cache = self.cache_manager.register_blocks_cache( + self.cache_desc, self.cache_addr, self.cache_key) + logger.info( + "LLMDataDistCMgrConnectorWorker: End of register Paged Cache." + ) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to register_block_cache, receiving [cache_desc: {self.cache_desc}, cache_addr: {self.cache_addr}, cache_key: {self.cache_key}]" + ) + self.ready_event = threading.Event() + self.metadata_agent_listener_t = threading.Thread( + target=self.listen_for_agent_metadata_req, + args=(self.ready_event, ), + daemon=True, + name="metadata_agent_listener") + self.metadata_agent_listener_t.start() + self.ready_event.wait() + + def start_load_kv(self, metadata: LLMDataDistCMgrConnectorMetadata): + futures = [] + for req_id, meta in metadata.requests.items(): + logger.debug(f"Start to transmit {req_id}") + future = self.executor.submit( + self._read_blocks, + local_block_ids=meta.local_block_ids, + remote_block_ids=meta.remote_block_ids, + remote_ip=meta.remote_host, + remote_port=int(meta.remote_port), + remote_engine_id=meta.engine_id, + request_id=req_id, + remote_tp_size=meta.remote_tp_size, + ) + futures.append(future) + + def handle_exception(future): + if 
future.exception(): + logger.error(f"KV transfer task failed: {future.exception()}") + + for future in futures: + future.add_done_callback(handle_exception) + + def add_remote_agent(self, metadata: LLMDataDistCMgrAgentMetadata) -> int: + assert self.local_agent_metadata is not None + remote_cluster_id = metadata.cluster_id + if remote_cluster_id in self.linked_cluster: + logger.debug( + f"LLMDataDistCMgrConnectorWorker: remote cluster_id: {metadata.cluster_id} already linked with this server, skip the connection" + ) + return remote_cluster_id + remote_super_pod_id = metadata.super_pod_id + remote_server_id = metadata.server_id + is_same_server = remote_server_id == self.local_agent_metadata.server_id + is_same_pod = remote_super_pod_id == self.local_agent_metadata.super_pod_id + if self.llm_datadist_role == LLMRole.PROMPT: + prefill_metadata = self.local_agent_metadata + decode_metadata = metadata + else: + prefill_metadata = metadata + decode_metadata = self.local_agent_metadata + comm_name = f"pd_comm_{prefill_metadata.device_ip}_{decode_metadata.device_ip}" + cluster_rank_info = { + prefill_metadata.cluster_id: 0, + decode_metadata.cluster_id: 1 + } + rank_table = {} + rank_table["version"] = "1.2" + rank_table["server_count"] = "1" if is_same_server else "2" + rank_table["status"] = "completed" + + # generate server_list for rank table + rank_table["server_list"] = [] # type: ignore[assignment] + decode_server_device_info = None + prefill_server_device_info = { + "device": [{ + k: v + for k, v in [( + "device_id", prefill_metadata.device_id + ), ("device_ip", prefill_metadata.device_ip + ), ("super_device_id", + prefill_metadata.super_device_id), ("rank_id", "0")] + if v is not None + }], + "server_id": + prefill_metadata.server_id + } + if is_same_server: + prefill_server_device_info["device"].append( # type: ignore[attr-defined] + { + k: v + for k, v in [( + "device_id", decode_metadata.device_id + ), ("device_ip", decode_metadata.device_ip + ), 
("super_device_id", + decode_metadata.super_device_id), ("rank_id", "1")] + if v is not None + }) + else: + decode_server_device_info = { + "device": [{ + k: v + for k, v in [( + "device_id", decode_metadata.device_id + ), ("device_ip", decode_metadata.device_ip + ), ("super_device_id", + decode_metadata.super_device_id), ("rank_id", "1")] + if v is not None + }], + "server_id": + decode_metadata.server_id + } + rank_table["server_list"].append( # type: ignore[attr-defined] + prefill_server_device_info) + if decode_server_device_info is not None: + rank_table["server_list"].append( # type: ignore[attr-defined] + decode_server_device_info) + + if self.soc_info == AscendSocVersion.A3: + # generate super_pod_list for rank table + super_pod_list = [] + prefill_super_pod_info = { + "super_pod_id": prefill_metadata.super_pod_id, + "server_list": [{ + "server_id": prefill_metadata.server_id + }], + } + if is_same_pod and not is_same_server: + prefill_super_pod_info[ + "server_list"].append( # type: ignore[attr-defined] + {"server_id": decode_metadata.server_id}) + super_pod_list.append(prefill_super_pod_info) + if not is_same_pod: + decode_super_pod_id = { + "super_pod_id": decode_metadata.super_pod_id, + "server_list": [{ + "server_id": decode_metadata.server_id + }], + } + super_pod_list.append(decode_super_pod_id) + rank_table[ + "super_pod_list"] = super_pod_list # type: ignore[assignment] + logger.info( + f"LLMDataDistCMgrConnectorWorker: try link with remote, comm id: {comm_name}" + ) + logger.info(f"rank table \n{rank_table}") + logger.info(f"comm name: {comm_name}") + logger.info(f"cluster rank info: {cluster_rank_info}") + comm_id = self.llm_datadist.link(comm_name, cluster_rank_info, + json.dumps(rank_table)) + while True: + ret = self.llm_datadist.query_register_mem_status(comm_id=comm_id) + if ret == llm_datadist.RegisterMemStatus.OK: + logger.info( + f"LLMDataDistCMgrConnectorWorker: Linking success, comm id: {comm_id}" + ) + break + elif ret == 
llm_datadist.RegisterMemStatus.FAILED: + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Linking failed, comm id: {comm_id}" + ) + time.sleep(1) + logger.info("Checking query_register_mem_status again") + self.linked_cluster.update({remote_cluster_id: comm_id}) + logger.info(f"cached linked cluster: {self.linked_cluster}") + logger.info( + f"Successfully build link with cluster id {remote_cluster_id} with cluster name {comm_name} !" + ) + return remote_cluster_id + + def remove_remote_agent(self, cluster_id: int): + if cluster_id not in self.linked_cluster: + logger.warning( + f"LLMDataDistCMgrConnectorWorker: Warning! Can't remove remote client with cluster id {cluster_id} for its not exist in linked_cluster list" + ) + comm_id = self.linked_cluster[cluster_id] + try: + self.llm_datadist.unlink(comm_id) + self.linked_cluster.pop(cluster_id) + except LLMException: + logger.error( + f"Try to remove remote client with cluster id {cluster_id} failed!, program won't terminate, but please carefully check your environment" + ) + logger.info( + f"Successfully remove remote client with cluster id {cluster_id} !" 
+ ) + + def connect_to_remote_agent(self, host: str, port: int) -> int: + url = f"tcp://{host}:{port}" + logger.debug(f"Querying metadata from url: {url}") + msg_encoder = msgspec.msgpack.Encoder() + msg_send = msg_encoder.encode( + [LLMDataDistCMgrEvent.ReqForMetadata, self.local_agent_metadata]) + with zmq_ctx(zmq.REQ, url) as sock: # type: ignore[attr-defined] + logger.info("Try request remote metadata from socket......") + sock.send(msg_send) + metadata_bytes = sock.recv() + decoder = msgspec.msgpack.Decoder() + metadata = decoder.decode(metadata_bytes) + metadata = LLMDataDistCMgrAgentMetadata(**metadata) + logger.info(f"recving metadata: {metadata}") + cluster_id = self.add_remote_agent(metadata) + return cluster_id + + def send_finish_to_remote(self, host: str, port: int, request_id): + url = f"tcp://{host}:{port}" + logger.debug(f"Sending finished to remote: {url}") + msg_encoder = msgspec.msgpack.Encoder() + msg_send = msg_encoder.encode([ + LLMDataDistCMgrEvent.ReqForFinished, + [request_id, self.tp_rank, self.tp_size] + ]) + with zmq_ctx(zmq.REQ, url) as sock: # type: ignore[attr-defined] + try: + sock.send(msg_send) + logger.debug( + f"Request id {request_id} finished message send to remote {url}" + ) + _ = sock.recv() + except Exception as e: + logger.error( + f"Failed to send reqest_id {request_id} to prefill: {e}") + + def _read_blocks( + self, + local_block_ids: list[int], + remote_block_ids: list[int], + remote_ip: str, + remote_port: int, + remote_engine_id: str, + request_id: str, + remote_tp_size: str, + ): + # if remote_ip not in self.linked_cluster: + tp_offset = self.tp_rank % int(remote_tp_size) + remote_cluster_id = self.connect_to_remote_agent( + remote_ip, remote_port + tp_offset) + num_local_blocks = len(local_block_ids) + if num_local_blocks == 0: + return + num_remote_blocks = len(remote_block_ids) + assert num_local_blocks <= num_remote_blocks + if num_local_blocks < num_remote_blocks: + remote_block_ids = 
remote_block_ids[-num_local_blocks:] + + logger.info(f"remote cluster id is: {remote_cluster_id}") + if self.use_mla: + remote_cache_key_k_normed = BlocksCacheKey( + cluster_id=remote_cluster_id, model_id=0) + remote_cache_key_k_pe = BlocksCacheKey( + cluster_id=remote_cluster_id, model_id=1) + logger.info("Try pull blocks from remote server") + try: + self.cache_manager.pull_blocks( + remote_cache_key_k_normed, + self.cache[0], # type: ignore[has-type] + remote_block_ids, + local_block_ids) + self.cache_manager.pull_blocks( + remote_cache_key_k_pe, + self.cache[1], # type: ignore[has-type] + remote_block_ids, + local_block_ids) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key_k_normed} {remote_cache_key_k_pe}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}" # type: ignore[has-type] + ) + except LLMException: + raise RuntimeError( + "LLMDataDistCMgrConnectorWorker: Timeout during pull_blocks, you can try to increase the sync_kv_timeout config or checking your connect status" + ) + else: + remote_cache_key = BlocksCacheKey(cluster_id=remote_cluster_id) + logger.info("Try pull blocks from remote server") + try: + self.cache_manager.pull_blocks( + remote_cache_key, + self.cache, # type: ignore[has-type] + remote_block_ids, + local_block_ids) + except (TypeError, ValueError): + raise RuntimeError( + f"LLMDataDistCMgrConnectorWorker: Passing unexpected parameter to pull_blocks remote_cache_key: {remote_cache_key}, cache: {self.cache}, local_block_ids: {local_block_ids}, remote_block_ids: {remote_block_ids}" # type: ignore[has-type] + ) + except LLMException: + raise RuntimeError( + "LLMDataDistCMgrConnectorWorker: Timeout during pull_blocks, you can try to increase the sync_kv_timeout config or checking your connect status" + ) + self.send_finish_to_remote(remote_ip, remote_port, request_id) + with 
self.thread_lock: + self.finished_reqs.add(request_id) + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """Get the finished recving and sending requuests.""" + import copy + with self.thread_lock: + req_ids_to_ret = copy.deepcopy(self.finished_reqs) + self.finished_reqs.clear() + if self.llm_datadist_role == LLMRole.PROMPT: + return req_ids_to_ret, None + else: + return None, req_ids_to_ret + + +# adopt this from https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +@contextlib.contextmanager +def zmq_ctx(socket_type: Any, + addr: str) -> Iterator[zmq.Socket]: # type: ignore[name-defined] + """Context manager for a ZMQ socket""" + + ctx: Optional[zmq.Context] = None # type: ignore[name-defined] + try: + ctx = zmq.Context() # type: ignore[attr-defined] + + if socket_type == zmq.ROUTER: # type: ignore[attr-defined] + socket = ctx.socket(zmq.ROUTER) # type: ignore[attr-defined] + socket.bind(addr) + elif socket_type == zmq.REQ: # type: ignore[attr-defined] + socket = ctx.socket(zmq.REQ) # type: ignore[attr-defined] + socket.connect(addr) + else: + raise ValueError(f"Unexpected socket type: {socket_type}") + + yield socket + finally: + if ctx is not None: + ctx.destroy(linger=0) diff --git a/vllm_ascend/distributed/moe_comm_method.py b/vllm_ascend/distributed/moe_comm_method.py new file mode 100644 index 0000000..aa9bae8 --- /dev/null +++ b/vllm_ascend/distributed/moe_comm_method.py @@ -0,0 +1,556 @@ +from abc import ABC, abstractmethod +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch_npu +from vllm.distributed import tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.fused_moe import FusedMoEConfig 
from vllm_ascend.distributed.communication_op import \
    data_parallel_reduce_scatter
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version


class MoECommMethod(ABC):
    """Base class for MoE communication methods."""

    def __init__(self, moe_config: FusedMoEConfig):
        # Holds parallelism sizes/ranks and expert counts used by subclasses.
        self.moe_config = moe_config

    @abstractmethod
    def prepare(
            self, hidden_states: torch.Tensor,
            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Prepare the MoE communication method.

        This method is called before quant_method.apply to prepare the
        communication method. It can be used to initialize any necessary
        resources or configurations.
        """
        pass

    @abstractmethod
    def finalize(self, hidden_states: torch.Tensor,
                 reduce_results: bool) -> torch.Tensor:
        """Finalize the MoE communication method.

        This method is called after quant_method.apply to finalize the
        communication method. It can be used to clean up any resources or
        configurations.
        """
        pass

    @abstractmethod
    def permute(
        self,
        hidden_states: torch.Tensor,
        topk_ids: torch.Tensor,
        topk_weights: torch.Tensor,
        expert_map: torch.Tensor,
        num_experts: int,
        apply_a8_quantization: bool,
    ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]:
        """Pre-process before MLP.

        Args:
            hidden_states (torch.Tensor): Tensor of shape (num_tokens, hidden_size)
            topk_ids (torch.Tensor): Tensor of shape (num_tokens, top_k_num)
            topk_weights (torch.Tensor): Tensor of shape (num_tokens, top_k_num)
            expert_map (torch.Tensor): Tensor of shape (global_num_experts, )
                Mapping from global expert IDs to local expert IDs.
            num_experts (int): Number of local experts (experts on this device).
            apply_a8_quantization (bool): Whether to apply A8 quantization (W4A8 and W8A8).

        Returns:
            tuple[torch.Tensor, torch.Tensor, int]: Return a tuple containing:
                - permuted_hidden_states (torch.Tensor): Tensor of shape
                    (num_tokens * top_k_num, hidden_size) after permuting
                    hidden_states based on topk_ids.
                - expert_tokens (torch.Tensor): Tensor of shape (num_experts, )
                    Number of tokens assigned to each expert.
                - dynamic_scale (torch.Tensor, optional): Tensor of shape (num_experts, )
                    Dynamic scale for each expert, used for quantization.
                - group_list_type (int): Type of group list, 0 for `cumsum`
                    and 1 for `count`. This is mainly for `npu_grouped_matmul`
                    to determine how to handle the output.
        Raises:
            NotImplementedError: If the method is not implemented in the subclass.
        """
        pass

    @abstractmethod
    def unpermute(self, mlp_output: torch.Tensor,
                  hidden_states: torch.Tensor) -> None:
        """Post-process after MLP.

        Args:
            mlp_output (torch.Tensor): Tensor of shape
                (num_tokens * top_k_num, hidden_size) after MLP.
            hidden_states (torch.Tensor): Tensor of shape
                (num_tokens, hidden_size) to be updated with the final output.
        """
        pass


class AllGatherCommImpl(MoECommMethod):
    """This implementation is the same as NativeAllGatherCommImpl,
    but uses NPU-specific ops for better performance.

    This implementation should be compatible with all scenarios, and
    thus it is the default implementation for MoE communication methods.
    It uses `torch_npu.npu_moe_init_routing_v2` for pre-processing
    and `torch_npu.npu_moe_token_unpermute` for post-processing
    to handle the token-to-expert mapping and communication efficiently.

    NOTE(Yizhou): TBH, it is really weird that we were supposed to use
    `torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing`
    or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute`
    for pre-processing and post-processing, respectively.
    But `npu_moe_finalize_routing` will lead to accuracy issues so we have to
    use `torch_npu.npu_moe_token_unpermute` instead.
    This is a workaround and should be removed after the issue is fixed.
    """

    def prepare(
            self, hidden_states: torch.Tensor,
            router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """When DP size > 1, pad the hidden states and router logits for communication."""
        if self.moe_config.dp_size > 1:
            forward_context = get_forward_context()
            max_tokens_across_dp = forward_context.max_tokens_across_dp

            # Remember the unpadded token count so finalize() can slice the
            # padding back off after the reduce-scatter.
            self.num_tokens = hidden_states.shape[0]
            pad_size = max_tokens_across_dp - self.num_tokens
            if pad_size > 0:
                # Pad the token dimension (dim 0) up to the DP-wide maximum.
                hidden_states = nn.functional.pad(hidden_states,
                                                  (0, 0, 0, pad_size))
                router_logits = nn.functional.pad(router_logits,
                                                  (0, 0, 0, pad_size))

            hidden_states = self.moe_config.dp_group.all_gather(
                hidden_states, 0)
            router_logits = self.moe_config.dp_group.all_gather(
                router_logits, 0)

        return hidden_states, router_logits

    def finalize(self, hidden_states: torch.Tensor,
                 reduce_results: bool) -> torch.Tensor:
        """When DP size > 1, reduce-scatter the hidden states to get the final output.

        When TP size > 1, all-reduce the hidden states to get the final output.
+ """ + if self.moe_config.dp_size > 1: + hidden_states = data_parallel_reduce_scatter(hidden_states, dim=0) + hidden_states = hidden_states[:self.num_tokens] + + if reduce_results and (self.moe_config.tp_size > 1 + or self.moe_config.ep_size > 1): + hidden_states = tensor_model_parallel_all_reduce(hidden_states) + + return hidden_states + + def permute( + self, + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + expert_map: torch.Tensor, # noqa: F841 + num_experts: int, + apply_a8_quantization: bool, + ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]: + num_tokens = hidden_states.shape[0] + + self.topk_weights = topk_weights + self.topk_ids = topk_ids + + first_expert_idx = 0 + if expert_map is not None: + # FIXME: npu_grouped_matmul output random values at [num_valid_tokens:, ...] + # So we need to filter out invalid tokens by zeroing their weights. + # This is a workaround and should be removed after the issue is fixed + mask = expert_map[topk_ids] != -1 + # NOTE: This is equivalent to self.topk_weights[~mask] = 0.0, + # but ~mask will dispatch to aclnnNonzeroV2, which is not supported in ACL Graph + self.topk_weights = torch.where(mask, topk_weights, 0.0) + + first_expert_idx = self.moe_config.ep_rank * num_experts + last_expert_idx = first_expert_idx + num_experts + + permuted_hidden_states, expanded_row_idx, expert_tokens, _ = ( + torch_npu.npu_moe_init_routing_v2( + hidden_states, + topk_ids, + active_num=num_tokens * self.moe_config.experts_per_token, + expert_num=self.moe_config.num_experts, + expert_tokens_num_type=1, # Only support `count` mode now + expert_tokens_num_flag=True, # Output `expert_tokens` + active_expert_range=[first_expert_idx, last_expert_idx], + quant_mode=-1, + )) + self.expanded_row_idx = expanded_row_idx + permuted_hidden_states = permuted_hidden_states + + group_list_type = 1 # `count` mode + + return permuted_hidden_states, expert_tokens, None, group_list_type + + def 
unpermute(self, mlp_output: torch.Tensor, + hidden_states: torch.Tensor) -> None: + hidden_states[:] = torch_npu.npu_moe_token_unpermute( + permuted_tokens=mlp_output, + sorted_indices=self.expanded_row_idx, + probs=self.topk_weights) + + +class NativeAllGatherCommImpl(AllGatherCommImpl): + """This implementation should be compatible with all scenarios. + + Note that this implementation purely consists of native PyTorch ops + and does not use any NPU-specific ops. So the performance may not be optimal. + But it is a good fallback for scenarios where NPU-specific ops are not available. + """ + + def permute( + self, + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + expert_map: torch.Tensor, + num_experts: int, + apply_a8_quantization: bool, + ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]: + num_tokens = hidden_states.shape[0] + + # Generate token indices and flatten + token_indices = torch.arange(num_tokens, + device=hidden_states.device, + dtype=torch.int64) + token_indices = (token_indices.unsqueeze(1).expand( + -1, self.moe_config.experts_per_token).reshape(-1)) + + # Flatten token-to-expert mappings and map to local experts + weights_flat = topk_weights.view(-1) + experts_flat = topk_ids.view(-1) + local_experts_flat = (expert_map[experts_flat] + if expert_map is not None else experts_flat) + + # Filter valid token-expert pairs + mask = local_experts_flat != -1 + # FIXME: npu_grouped_matmul output random values at [num_valid_tokens:, ...] + # So we need to filter out invalid tokens by zeroing their weights. 
        # This is a workaround and should be removed after the issue is fixed
        filtered_weights = torch.where(mask, weights_flat,
                                       torch.zeros_like(weights_flat)).to(
                                           topk_weights.dtype)
        # Invalid pairs are routed to a sentinel expert index (num_experts)
        # so they sort to the end and fall outside the counted range.
        filtered_experts = torch.where(
            mask,
            local_experts_flat,
            torch.full_like(local_experts_flat, num_experts),
        ).to(topk_ids.dtype)

        # Sort by local expert IDs
        sort_indices = torch.argsort(filtered_experts.view(torch.float32))
        self.sorted_token_indices = token_indices[sort_indices]
        self.sorted_weights = filtered_weights[sort_indices]

        # Compute token counts with minlength of num_experts
        # This is equivalent to but faster than:
        # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1]
        token_counts = torch.zeros(num_experts + 1,
                                   device=hidden_states.device,
                                   dtype=torch.int64)
        ones = torch.ones_like(filtered_experts, dtype=torch.int64)
        token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones)
        # Slot num_experts collects the invalid-pair sentinels; drop it.
        expert_tokens = token_counts[:num_experts]

        # Rearrange hidden_states
        permuted_hidden_states = hidden_states[self.sorted_token_indices]

        group_list_type = 1  # `count` mode

        return permuted_hidden_states, expert_tokens, None, group_list_type

    def unpermute(self, mlp_output: torch.Tensor,
                  hidden_states: torch.Tensor) -> None:
        # Weight each expert output, then scatter-add back to token order.
        mlp_output = mlp_output * self.sorted_weights.unsqueeze(1)

        final_hidden_states = torch.zeros_like(hidden_states)
        final_hidden_states.index_add_(0, self.sorted_token_indices,
                                       mlp_output)

        hidden_states[:] = final_hidden_states


class MC2CommImpl(MoECommMethod):
    """This implementation is for the scenarios listed below:
    1. `enable_expert_parallel=True`.
    2. `npu_moe_distribute_dispatch` and `npu_moe_distribute_combine` are available.
    3. `enable_expert_parallel=False` is not supported.

    This implementation uses the MC2 communication method, which is optimized for
    Communication and Computation parallelism on Ascend devices.
+ """ + + def __init__(self, moe_config: Optional[FusedMoEConfig]): + super().__init__(moe_config) + + # NOTE: We do not need to use mc2_group's rank and world size + # because ep_group and mc2_group basically have the same init params. + # We only init another group because of the restriction of MC2: + # "No other groups can be used in the same process as the MC2 group." + self.mc2_comm_name = get_mc2_group().device_group._get_backend( + torch.device("npu")).get_hccl_comm_name(self.moe_config.ep_rank) + + # Feature flags + self.enable_dispatch_v2 = hasattr(torch_npu, + "npu_moe_distribute_dispatch_v2") + self.is_ascend_a3 = get_ascend_soc_version() == AscendSocVersion.A3 + self.need_extra_args = self.is_ascend_a3 + self._restore_tp_across_dp() + + def _restore_tp_across_dp(self): + # NOTE: Since vLLM flatten tp across dp, we need to restore the original + # tp_size and tp_rank. + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + def prepare( + self, hidden_states: torch.Tensor, + router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """The target_pad_length is calculated in forward_context, here we pad the + hidden states and router logits. And if TP size > 1, we also need to split + the tensors accordingly. 
+ """ + self.num_tokens, _ = hidden_states.shape + forward_context = get_forward_context() + self.mc2_mask = forward_context.mc2_mask + target_pad_length = forward_context.padded_num_tokens + pad_size = target_pad_length - self.num_tokens + + if pad_size > 0: + hidden_states = nn.functional.pad(hidden_states, + (0, 0, 0, pad_size)) + router_logits = nn.functional.pad(router_logits, + (0, 0, 0, pad_size)) + + if self.tp_size > 1: + split_hidden_states = torch.tensor_split(hidden_states, + self.tp_size, + dim=0) + split_router_logits = torch.tensor_split(router_logits, + self.tp_size, + dim=0) + split_mc2_mask = torch.tensor_split(self.mc2_mask, + self.tp_size, + dim=0) + self.split_hidden_states = split_hidden_states + + hidden_states = split_hidden_states[self.tp_rank] + router_logits = split_router_logits[self.tp_rank] + self.mc2_mask = split_mc2_mask[self.tp_rank] + + return hidden_states, router_logits + + def finalize(self, hidden_states: torch.Tensor, + reduce_results: bool) -> torch.Tensor: + """If TP size > 1, all-gather the hidden states to get the final output. + + Also, unpad the hidden states if needed. 
+ """ + if self.tp_size > 1: + dist.all_gather(list(self.split_hidden_states), hidden_states, + self.moe_config.tp_group.device_group) + hidden_states = torch.cat(self.split_hidden_states, dim=0) + + if self.num_tokens < hidden_states.shape[0]: + hidden_states = hidden_states[:self.num_tokens] + + return hidden_states + + def permute( + self, + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + expert_map: torch.Tensor, + num_experts: int, + apply_a8_quantization: bool, + ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]: + # Store tensors needed for post_process + self.topk_ids = topk_ids + self.topk_weights = topk_weights.to(torch.float32) + + dispatch_kwargs = { + "x": hidden_states, + "expert_ids": self.topk_ids, + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": self.moe_config.num_experts, + "global_bs": 0, + "scales": None, + "quant_mode": 2 if apply_a8_quantization else 0, + "group_ep": self.mc2_comm_name, + "ep_world_size": self.moe_config.ep_size, + "ep_rank_id": self.moe_config.ep_rank, + } + + if self.need_extra_args: + dispatch_kwargs.update({ + "group_tp": self.mc2_comm_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if self.is_ascend_a3 and self.enable_dispatch_v2: + dispatch_kwargs.update({ + "x_active_mask": self.mc2_mask, + }) + + dispatch = torch_npu.npu_moe_distribute_dispatch_v2 if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_dispatch + + ( + permuted_hidden_states, + dynamic_scale, + self.assist_info_for_combine, + expert_tokens, + self.ep_recv_counts, + self.tp_recv_counts, + ) = dispatch(**dispatch_kwargs)[:6] + + group_list_type = 1 + + return permuted_hidden_states, expert_tokens, dynamic_scale, group_list_type + + def unpermute(self, mlp_output: torch.Tensor, + hidden_states: torch.Tensor) -> None: + combine_kwargs = { + "expand_x": mlp_output, + "expert_ids": self.topk_ids, + "expert_scales": self.topk_weights, + "expert_shard_type": 0, + 
"shared_expert_rank_num": 0, + "moe_expert_num": self.moe_config.num_experts, + "global_bs": 0, + "ep_send_counts": self.ep_recv_counts, + "group_ep": self.mc2_comm_name, + "ep_world_size": self.moe_config.ep_size, + "ep_rank_id": self.moe_config.ep_rank, + } + + if self.enable_dispatch_v2: + combine_kwargs[ + "assist_info_for_combine"] = self.assist_info_for_combine + else: + combine_kwargs["expand_idx"] = self.assist_info_for_combine + + if self.need_extra_args: + combine_kwargs.update({ + "tp_send_counts": self.tp_recv_counts, + "group_tp": self.mc2_comm_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if self.is_ascend_a3 and self.enable_dispatch_v2: + combine_kwargs.update({ + "x_active_mask": self.mc2_mask, + }) + + combine = torch_npu.npu_moe_distribute_combine_v2 if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine + + hidden_states[:] = combine(**combine_kwargs) + + +class AlltoAllCommImpl(MoECommMethod): + """This implementation is for the scenarios listed below: + 1. `enable_expert_parallel=True`. + 2. `npu_grouped_matmul` is available. + + This implementation uses all-to-all communication to exchange tokens + between data parallel ranks before and after the MLP computation. It should + have better performance than AllGatherCommImpl when DP size > 1. + """ + + def __init__(self, moe_config: Optional[FusedMoEConfig]): + super().__init__(moe_config) + from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \ + get_token_dispatcher + self.token_dispatcher = get_token_dispatcher( + "TokenDispatcherWithAll2AllV") + self._restore_tp_across_dp() + + def _restore_tp_across_dp(self): + # NOTE: Since vLLM flatten tp across dp, we need to restore the original + # tp_size and tp_rank. 
+ self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + + def prepare( + self, hidden_states: torch.Tensor, + router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + self.num_tokens, _ = hidden_states.shape + pad_size = self.tp_size - self.num_tokens + + if pad_size > 0: + hidden_states = nn.functional.pad(hidden_states, + (0, 0, 0, pad_size)) + router_logits = nn.functional.pad(router_logits, + (0, 0, 0, pad_size)) + + if self.tp_size > 1: + split_hidden_states = torch.tensor_split(hidden_states, + self.tp_size, + dim=0) + split_router_logits = torch.tensor_split(router_logits, + self.tp_size, + dim=0) + self.split_hidden_states = split_hidden_states + + hidden_states = split_hidden_states[self.tp_rank] + router_logits = split_router_logits[self.tp_rank] + + return hidden_states, router_logits + + def finalize(self, hidden_states: torch.Tensor, + reduce_results: bool) -> torch.Tensor: + """If TP size > 1, all-gather the hidden states to get the final output. + + Also, unpad the hidden states if needed. 
+ """ + if self.tp_size > 1: + dist.all_gather(list(self.split_hidden_states), hidden_states, + self.moe_config.tp_group.device_group) + hidden_states = torch.cat(self.split_hidden_states, dim=0) + + if self.num_tokens < hidden_states.shape[0]: + hidden_states = hidden_states[:self.num_tokens] + + return hidden_states + + def permute( + self, + hidden_states: torch.Tensor, + topk_ids: torch.Tensor, + topk_weights: torch.Tensor, + expert_map: torch.Tensor, + num_experts: int, + apply_a8_quantization: bool, + ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], int]: + results = self.token_dispatcher.token_dispatch( + hidden_states, + topk_weights, + topk_ids, + None, + log2phy=None, + with_quant=apply_a8_quantization) + return results["hidden_states"], results["group_list"], results[ + "dynamic_scale"], results["group_list_type"] + + def unpermute(self, mlp_output: torch.Tensor, + hidden_states: torch.Tensor) -> None: + hidden_states[:] = self.token_dispatcher.token_combine(mlp_output) diff --git a/vllm_ascend/distributed/mooncake_connector.py b/vllm_ascend/distributed/mooncake_connector.py new file mode 100644 index 0000000..4faf37d --- /dev/null +++ b/vllm_ascend/distributed/mooncake_connector.py @@ -0,0 +1,1070 @@ +# SPDX-License-Identifier: Apache-2.0 +import contextlib +import hashlib +import math +import queue +import random +import struct +import threading +import time +from collections import defaultdict, deque +from collections.abc import Iterator +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, List, Optional, Tuple + +import msgspec +import numpy as np +import numpy.typing as npt +import torch +import zmq +from mooncake.engine import TransferEngine # type: ignore +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.distributed.parallel_state import 
(get_tensor_model_parallel_rank, + get_tp_group) +from vllm.utils import get_ip, logger, make_zmq_path, make_zmq_socket +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.request import RequestStatus + +import vllm_ascend.envs as envs_ascend + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.request import Request + +GET_META_MSG = b"get_meta_msg" +DONE_RECVING_MSG = b"done_recving_msg" + + +class MooncakeAgentMetadata(msgspec.Struct, omit_defaults=True, dict=True): + engine_id: str + te_rpc_port: int + kv_caches_base_addr: list[int] + num_blocks: int + + +@dataclass +class ReqMeta: + local_block_ids: list[int] + remote_block_ids: list[int] + remote_host: str + remote_port: int + remote_engine_id: str + + +class KVCacheTaskTracker: + + def __init__(self): + super().__init__() + + self.done_task_lock = threading.Lock() + self.finished_requests: set[str] = set() + # Only used in prefill node. Tracks requests whose kv blocks freeing is + # intentionally delayed. Each entry is a tuple of (request_id, + # timestamp). If a request remains in this queue for too long, it will + # be force-freed. + self.delayed_free_requests: deque[Tuple[str, float]] = deque() + + def update_done_task_count(self, request_id: str): + with self.done_task_lock: + self.finished_requests.add(request_id) + self._remove_delayed_requests(request_id) + + def get_and_clear_finished_requests(self) -> set[str]: + """ + Get and clear the requests that have been completed. + Returns: + A set of request IDs that have been completed. 
+ """ + with self.done_task_lock: + finished_requests = self.finished_requests.copy() + expired_requests = self._retrieve_expired_requests() + finished_requests.update(expired_requests) + self.finished_requests.clear() + return finished_requests + + def add_delayed_request(self, request_id: str, delay_start_time: float): + """Add a delayed free request.""" + with self.done_task_lock: + self.delayed_free_requests.append((request_id, delay_start_time)) + + def _retrieve_expired_requests(self): + """Retrieve all expired delayed requests.""" + expired_requests: set[str] = set() + # Free delayed requests if they exceed the timeout + current_time = time.time() + while self.delayed_free_requests: + request_id, delay_start_time = self.delayed_free_requests[0] + if (current_time - delay_start_time + > envs_ascend.VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT): + self.delayed_free_requests.popleft() + expired_requests.add(request_id) + logger.info("Force freed request: %s", request_id) + else: + break + return expired_requests + + def _remove_delayed_requests(self, request_id: str): + """Remove all delayed free requests matching the given request_id.""" + self.delayed_free_requests = deque( + (r, t) for r, t in self.delayed_free_requests if r != request_id) + + +class KVCacheSendingThread(threading.Thread): + + def __init__(self, tp_rank: int, decode_tp_size: int, local_engine_id: str, + side_channel_host: str, side_channel_port: int, + metadata: MooncakeAgentMetadata, + ready_event: threading.Event): + super().__init__(daemon=True, name="KVCacheSendingThread") + self.tp_rank = tp_rank + self.decode_tp_size = decode_tp_size + self.local_engine_id = local_engine_id + self.side_channel_host = side_channel_host + self.side_channel_port = side_channel_port + self.metadata = metadata + self.ready_event = ready_event + + self.task_tracker = KVCacheTaskTracker() + + def get_and_clear_finished_requests(self) -> set[str]: + """ + Get and clear the requests that have been completed. 
+        Returns:
+            A set of request IDs that have been completed.
+        """
+        return self.task_tracker.get_and_clear_finished_requests()
+
+    def add_delayed_request(self, request_id: str, delay_start_time: float):
+        return self.task_tracker.add_delayed_request(request_id,
+                                                     delay_start_time)
+
+    def run(self):
+        """Run the thread to handle KV cache transfer requests."""
+
+        encoder = msgspec.msgpack.Encoder()
+        encoded_data = encoder.encode(self.metadata)
+        size_in_bytes = len(encoded_data)
+        logger.debug("Size of encoded MooncakeAgentMetadata: %s bytes",
+                     str(size_in_bytes))
+
+        # Listen for new requests for metadata.
+        # NOTE(rob): we need each rank to have a unique port. This hack keeps
+        # us moving. We will switch when moving to etcd or where we have a
+        # single ZMQ socket in the scheduler.
+        handshake_port = self.side_channel_port + self.tp_rank
+        path = make_zmq_path("tcp", self.side_channel_host, handshake_port)
+        logger.info("Starting listening on path: %s", path)
+        with zmq_ctx(zmq.ROUTER, path) as sock:  # type: ignore
+            self.ready_event.set()
+            decoder = msgspec.msgpack.Decoder(type=tuple)
+            while True:
+                try:
+                    frames = sock.recv_multipart()
+                    if len(frames) < 2:
+                        logger.error("Invalid message format: %s", frames)
+                        continue
+
+                    identity = frames[0]
+                    payload = [f for f in frames[1:] if f != b""]
+                    if len(payload) != 1:
+                        logger.error("Invalid message format: %s", frames)
+                        continue
+
+                    msg = decoder.decode(payload[0])
+                    if msg[0] == GET_META_MSG:
+                        sock.send_multipart((identity, b"", encoded_data))
+                    elif msg[0] == DONE_RECVING_MSG:
+                        logger.debug("Got DONE_RECVING_MSG for request %s",
+                                     msg[1])
+                        request_id = msg[1]
+                        self.task_tracker.update_done_task_count(request_id)
+                        # Acknowledge the request completion.
+                        while True:
+                            try:
+                                # Send ACK to the sender.
+                                sock.send_multipart(
+                                    (identity, b"", b"ACK"),
+                                    flags=zmq.NOBLOCK)  # type: ignore
+                                break
+                            except zmq.Again:  # type: ignore
+                                # If the socket is not ready, retry sending.
+ logger.debug( + "Socket not ready, retrying to send ACK for " + "request %s", msg[1]) + time.sleep(0.01) + else: + logger.error( + "Connection listener got unexpected message %s", + msg) + except Exception as e: + logger.error("Connection listener got exception %s: %s", + type(e), e) + + +class KVCacheRecvingThread(threading.Thread): + + def __init__(self, tp_rank: int, tp_size: int, engine: TransferEngine, + local_engine_id: str, local_handshake_port: int, + local_kv_caches_base_addr: list[int], block_len: list[int], + ready_event: threading.Event): + super().__init__(daemon=True, name="KVCacheRecvingThread") + self.tp_rank = tp_rank + self.tp_size = tp_size + + self.local_engine_id = local_engine_id + self.local_handshake_port = local_handshake_port + self.engine = engine + self.ready_event = ready_event + + self.kv_caches_base_addr: dict[str, dict[int, list[int]]] = \ + defaultdict(dict) + self.kv_caches_base_addr[local_engine_id][local_handshake_port] = \ + local_kv_caches_base_addr + self.remote_te_port: dict[str, dict[int, int]] = \ + defaultdict(dict) + self.block_len = block_len + # TODO(jianzs): find a better way to detect MLA. 
+ self.use_mla = len(block_len) == 2 + + self.request_queue: queue.Queue[Any] = queue.Queue() + # TODO(jianzs): make this configurable + self.executor = ThreadPoolExecutor(max_workers=32) + + self.task_tracker = KVCacheTaskTracker() + + self.encoder = msgspec.msgpack.Encoder() + self.decoder = msgspec.msgpack.Decoder(MooncakeAgentMetadata) + self.remote_sockets_lock = threading.Lock() + self.remote_sockets: dict[ # type: ignore + str, deque[zmq.Socket]] = defaultdict( # type: ignore + deque) + self.remote_poller = zmq.Poller() # type: ignore + self.timeout = 1.0 # seconds + + def add_request(self, request_id: str, local_block_ids: list[int], + remote_block_ids: list[int], remote_engine_id: str, + remote_host: str, remote_handshake_port: int): + """Add a new request to the queue for processing.""" + logger.debug(f"Adding request {request_id} to the queue.") + self.request_queue.put({ + "request_id": request_id, + "local_block_ids": local_block_ids, + "remote_block_ids": remote_block_ids, + "remote_engine_id": remote_engine_id, + "remote_host": remote_host, + "remote_handshake_port": remote_handshake_port, + }) + + def get_and_clear_finished_requests(self) -> set[str]: + """ + Get and clear the requests that have been completed. + Returns: + A set of request IDs that have been completed. 
+ """ + return self.task_tracker.get_and_clear_finished_requests() + + def run(self): + """Run the thread to handle KV cache transfer requests.""" + self.ready_event.set() + while True: + try: + request_data = self.request_queue.get() + if request_data is None: + logger.warning("Received a None request!") + self.request_queue.task_done() + continue + self._handle_request(request_data) + except Exception as e: + logger.error(f"Error in KVCacheTransferThread: {e}") + + def _handle_request(self, req_meta: dict[str, Any]): + request_id = req_meta["request_id"] + remote_host = req_meta["remote_host"] + remote_handshake_port = req_meta["remote_handshake_port"] + + try: + logger.debug( + f"Starting to transfer KV cache for request {request_id}.") + self._transfer_kv_cache(req_meta) + logger.debug( + f"Finished transferring KV cache for request {request_id}.") + except Exception as e: + logger.error("Failed to transfer KV cache for request " + f"{request_id}: {e}") + finally: + self.task_tracker.update_done_task_count(request_id) + # Always send the done signal to the remote host to ensure proper + # resource cleanup. Failing to do so may cause a memory leak on the + # remote host. + self._send_done_recv_signal(request_id, remote_host, + remote_handshake_port) + self.request_queue.task_done() + + def _transfer_kv_cache(self, req_meta: dict[str, Any]): + """Handle a KV cache transfer request.""" + request_id = req_meta["request_id"] + remote_block_ids = req_meta["remote_block_ids"] + local_block_ids = req_meta["local_block_ids"] + remote_engine_id = req_meta["remote_engine_id"] + remote_host = req_meta["remote_host"] + remote_handshake_port = req_meta["remote_handshake_port"] + + # Full prefix cache hit: do not need to read remote blocks, just notify + # P worker that we have the blocks we need. + if len(local_block_ids) == 0: + return + + # Check if we have the remote metadata cached. 
+ if remote_engine_id not in self.kv_caches_base_addr or \ + remote_handshake_port not in self.kv_caches_base_addr[remote_engine_id]: + self._get_remote_metadata(remote_host, remote_handshake_port) + + grouped_remote_block_ids, grouped_local_block_ids = \ + group_concurrent_contiguous(remote_block_ids, local_block_ids) + remote_kv_caches_base_addrs = \ + self.kv_caches_base_addr[remote_engine_id][remote_handshake_port] + local_kv_caches_base_addrs = \ + self.kv_caches_base_addr[self.local_engine_id][self.local_handshake_port] + + req_start_time = time.perf_counter() + num_transfer_groups = len(grouped_remote_block_ids) + num_blocks = len(local_block_ids) + + remote_transfer_port = self.remote_te_port[remote_engine_id][ + remote_handshake_port] + session_id = f"{remote_host}:{remote_transfer_port}" + src_list, dst_list, length_list = [], [], [] + for k, (src_layer_base_addr, dst_layer_base_addr) in enumerate( + zip(local_kv_caches_base_addrs, remote_kv_caches_base_addrs)): + block_len = (self.block_len[k % 2] + if self.use_mla else self.block_len[0]) + for i, remote_block_id in enumerate(grouped_remote_block_ids): + local_block_ids = grouped_local_block_ids[i] + src = src_layer_base_addr + local_block_ids[0] * block_len + dst = dst_layer_base_addr + remote_block_id[0] * block_len + length = len(local_block_ids) * block_len + src_list.append(src) + dst_list.append(dst) + length_list.append(length) + ret = self.engine.batch_transfer_sync_read(session_id, src_list, + dst_list, length_list) + if ret < 0: + logger.error("Mooncake transfer failed for request %s", + req_meta["request_id"]) + raise RuntimeError(f"Mooncake transfer failed, ret: {ret}") + + req_end_time = time.perf_counter() + req_transfer_elapsed = (req_end_time - req_start_time) * 1000 + logger.info( + "KV cache transfer for request %s took %.2f ms (%d groups," + " %d blocks).", request_id, req_transfer_elapsed, + num_transfer_groups, num_blocks) + + def _get_remote_metadata(self, remote_host: str, + 
remote_handshake_port: int) -> None: + """Get the metadata from the remote host.""" + sock: Optional[zmq.Socket] = None # type: ignore + try: + sock = self._get_remote_socket(remote_host, remote_handshake_port) + ensure_zmq_send(sock, self.encoder.encode((GET_META_MSG, ""))) + metadata_bytes = ensure_zmq_recv(sock, self.remote_poller) + agent_meta = self.decoder.decode(metadata_bytes) + engine_id = agent_meta.engine_id + assert engine_id != self.local_engine_id, ( + f"Conflict engine id {engine_id} with local engine id " + f"{self.local_engine_id}.") + self.kv_caches_base_addr[engine_id][remote_handshake_port] = \ + agent_meta.kv_caches_base_addr + self.remote_te_port[engine_id][remote_handshake_port] = \ + agent_meta.te_rpc_port + finally: + if sock is not None: + self._return_remote_socket(sock, remote_host, + remote_handshake_port) + logger.debug("Returned socket to pool for %s:%d", remote_host, + remote_handshake_port) + + def _send_done_recv_signal(self, request_id: str, remote_host: str, + remote_handshake_port: int): + logger.debug("Sending done recving signal for request %s to %s:%d", + request_id, remote_host, remote_handshake_port) + sock: Optional[zmq.Socket] = None # type: ignore + try: + sock = self._get_remote_socket(remote_host, remote_handshake_port) + data_bytes = self.encoder.encode((DONE_RECVING_MSG, request_id)) + ensure_zmq_send(sock, data_bytes) + resp = ensure_zmq_recv(sock, + self.remote_poller, + timeout=self.timeout) + logger.debug( + f"Received response for request {request_id}: {resp.decode('utf-8')}" + ) + if resp != b"ACK": + logger.error("Failed to receive ACK for request %s from %s:%d", + request_id, remote_host, remote_handshake_port) + raise RuntimeError( + f"Failed to receive ACK, resp: {resp.decode('utf-8')}") + finally: + if sock is not None: + self._return_remote_socket(sock, remote_host, + remote_handshake_port) + logger.debug("Returned socket to pool for %s:%d", remote_host, + remote_handshake_port) + + def 
_get_remote_socket( + self, remote_host: str, + remote_handshake_port: int) -> zmq.Socket: # type: ignore + """Get a socket to the remote host.""" + remote_path = make_zmq_path("tcp", remote_host, remote_handshake_port) + with self.remote_sockets_lock: + if self.remote_sockets[remote_path]: + return self.remote_sockets[remote_path].popleft() + + ctx = zmq.Context() # type: ignore + sock = make_zmq_socket( + ctx=ctx, + path=remote_path, + socket_type=zmq.REQ, # type: ignore + bind=False) + sock.setsockopt( + zmq.SNDTIMEO, # type: ignore + int(self.timeout * 1000)) + self.remote_poller.register(sock, zmq.POLLIN) # type: ignore + return sock + + def _return_remote_socket( + self, + sock: zmq.Socket, # type: ignore + remote_host: str, + remote_handshake_port: int) -> None: + """Return the remote socket to the pool.""" + remote_path = make_zmq_path("tcp", remote_host, remote_handshake_port) + with self.remote_sockets_lock: + self.remote_sockets[remote_path].append(sock) + + +class MooncakeConnectorMetadata(KVConnectorMetadata): + + def __init__(self): + self.requests: dict[str, ReqMeta] = {} + self.requests_to_send: dict[str, float] = {} + + def add_new_req( + self, + request_id: str, + local_block_ids: list[int], + kv_transfer_params: dict[str, Any], + ): + self.requests[request_id] = ReqMeta( + local_block_ids=local_block_ids, + remote_block_ids=kv_transfer_params["remote_block_ids"], + remote_engine_id=kv_transfer_params["remote_engine_id"], + remote_host=kv_transfer_params["remote_host"], + remote_port=kv_transfer_params["remote_port"], + ) + + +class MooncakeConnector(KVConnectorBase_V1): + + def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): + assert vllm_config.kv_transfer_config is not None + self.engine_id = vllm_config.kv_transfer_config.engine_id + + if role == KVConnectorRole.SCHEDULER: + self.connector_scheduler: Optional[MooncakeConnectorScheduler] = \ + MooncakeConnectorScheduler(vllm_config, str(self.engine_id)) + self.connector_worker: 
Optional[MooncakeConnectorWorker] = None + elif role == KVConnectorRole.WORKER: + self.connector_scheduler = None + self.connector_worker = MooncakeConnectorWorker( + vllm_config, str(self.engine_id)) + + ############################################################ + # Scheduler Side Methods + ############################################################ + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_num_new_matched_tokens( + request, num_computed_tokens) + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + assert self.connector_scheduler is not None + return self.connector_scheduler.update_state_after_alloc( + request, blocks, num_external_tokens) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + assert self.connector_scheduler is not None + return self.connector_scheduler.build_connector_meta(scheduler_output) + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + assert self.connector_scheduler is not None + return self.connector_scheduler.request_finished(request, block_ids) + + def get_finished_count(self) -> Optional[int]: + assert self.connector_scheduler is not None + return self.connector_scheduler.get_finished_count() + + ############################################################ + # Worker Side Methods + ############################################################ + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + assert self.connector_worker is not None + self.connector_worker.register_kv_caches(kv_caches) + + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + """Get the finished recving and sending requests.""" + assert self.connector_worker is not None + return 
self.connector_worker.get_finished() + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + assert self.connector_worker is not None + assert isinstance(self._connector_metadata, MooncakeConnectorMetadata) + self.connector_worker.start_load_kv(self._connector_metadata) + + def wait_for_layer_load(self, layer_name: str) -> None: + """MooncakeConnector does not do layerwise saving.""" + pass + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + """MooncakeConnector does not save explicitly.""" + pass + + def wait_for_save(self): + """MooncakeConnector does not save explicitly.""" + pass + + +class MooncakeConnectorScheduler: + """Implementation of Scheduler side methods""" + + def __init__(self, vllm_config: VllmConfig, engine_id: str): + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + self.engine_id = engine_id + logger.info("Initializing Mooncake Scheduler %s", engine_id) + + self.side_channel_host = get_ip() + self.max_device_id = vllm_config.parallel_config.tensor_parallel_size * \ + vllm_config.parallel_config.data_parallel_size + + # Handshake base port + self.side_channel_port = ( + vllm_config.kv_transfer_config.kv_port + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) + + # Requests that need to start recv. + # New requests are added by update_state_after_alloc in + # the scheduler. Used to make metadata passed to Worker. + self._reqs_need_recv: dict[str, tuple[Request, list[int]]] = {} + self._reqs_need_send: dict[str, float] = {} + + def get_num_new_matched_tokens( + self, request: "Request", + num_computed_tokens: int) -> tuple[int, bool]: + """ + For remote prefill, pull all prompt blocks from remote + asynchronously relative to engine execution. + + Args: + request (Request): the request object. 
+ num_computed_tokens (int): the number of locally + computed tokens for this request + Returns: + * the number of tokens that can be loaded from the + external KV cache beyond what is already computed. + * true if the external KV cache tokens will be loaded + asynchronously (between scheduler steps). + """ + + params = request.kv_transfer_params + logger.debug( + "MooncakeConnector get_num_new_matched_tokens: " + "num_computed_tokens=%s, kv_transfer_params=%s", + num_computed_tokens, params) + + if params is not None and params.get("do_remote_prefill"): + assert num_computed_tokens == 0, "Currently only support " \ + "prefill with num_computed_tokens == 0." + # Assume that the request's KV cache is already fully prefilled and + # can be fetched entirely from the prefill node. + count = max(len(request.prompt_token_ids) - 1, 0) + if count > 0: + return count, True + + # No remote prefill for this request. + return 0, False + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + + params = request.kv_transfer_params + logger.debug( + "MooncakeConnector update_state_after_alloc: " + "num_external_tokens=%s, kv_transfer_params=%s", + num_external_tokens, params) + + if params is not None and params.get("do_remote_prefill"): + if params.get("remote_block_ids"): + if all(p in params for p in ("remote_engine_id", "remote_host", + "remote_port")): + local_block_ids = (blocks.get_unhashed_block_ids() + if num_external_tokens > 0 else []) + # Get unhashed blocks to pull from remote. + self._reqs_need_recv[request.request_id] = ( + request, local_block_ids) + else: + logger.warning( + "Got invalid KVTransferParams: %s. This " + "request will not utilize KVTransfer", params) + else: + assert num_external_tokens == 0 + # Only trigger 1 KV transfer per request. 
+ params["do_remote_prefill"] = False + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + meta = MooncakeConnectorMetadata() + + # Loop through scheduled reqs and convert to ReqMeta. + for req_id, (req, block_ids) in self._reqs_need_recv.items(): + assert req.kv_transfer_params is not None + # For the case where there are no remote blocks to pull + # (block_ids is empty), we don't need to schedule + # an async read on the worker side. + meta.add_new_req( + request_id=req_id, + local_block_ids=block_ids, + kv_transfer_params=req.kv_transfer_params, + ) + + # Clear the list once workers start the transfers + self._reqs_need_recv.clear() + meta.requests_to_send = self._reqs_need_send + self._reqs_need_send = {} + + return meta + + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + """ + Once a request is finished, determine whether request blocks + should be freed now or will be sent asynchronously and freed later. 
+ """ + + params = request.kv_transfer_params + logger.debug( + "MooncakeConnector request_finished, request_status=%s, " + "kv_transfer_params=%s", request.status, params) + + if (params is None or not params.get("do_remote_decode") + or request.status != RequestStatus.FINISHED_LENGTH_CAPPED): + return False, None + + computed_block_ids = block_ids + delay_free_blocks = len(computed_block_ids) > 0 + if delay_free_blocks: + logger.info("Delaying free of %d blocks for request %s", + len(computed_block_ids), request.request_id) + self._reqs_need_send[request.request_id] = time.time() + + return delay_free_blocks, dict( + do_remote_prefill=True, + do_remote_decode=False, + remote_block_ids=computed_block_ids, + remote_engine_id=self.engine_id, + remote_host=self.side_channel_host, + remote_port=self.side_channel_port, + ) + + def get_finished_count(self) -> Optional[int]: + prefill_parallel_config: dict[ + str, + Any] = self.vllm_config.kv_transfer_config.get_from_extra_config( + "prefill", {}) + + assert "tp_size" in prefill_parallel_config.keys() + self._prefill_tp_size = prefill_parallel_config["tp_size"] + decode_parallel_config: dict[ + str, + Any] = self.vllm_config.kv_transfer_config.get_from_extra_config( + "decode", {}) + assert "tp_size" in decode_parallel_config.keys() + self._decode_tp_size = decode_parallel_config["tp_size"] + + if self.vllm_config.model_config.use_mla: + return self._decode_tp_size + else: + # TODO support mha and gqa + return None + + +class MooncakeConnectorWorker: + """Implementation of Worker side methods""" + + def __init__(self, vllm_config: VllmConfig, engine_id: str): + self._get_prefill_decode_size(vllm_config) + if self._prefill_tp_size < self._decode_tp_size: + raise ValueError( + f"prefill_tp_size: {self._prefill_tp_size} must be greater than" + f" or equal to the decode_tp_size: {self._decode_tp_size}") + + if TransferEngine is None: + raise RuntimeError("mooncake is not available") + logger.info("Initializing Mooncake work 
%s", engine_id) + self.engine = TransferEngine() + + # Metadata. + self.vllm_config = vllm_config + self.engine_id = engine_id + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = vllm_config.parallel_config.tensor_parallel_size + self.tp_group = get_tp_group() + self.dp_rank = vllm_config.parallel_config.data_parallel_rank_local + self.dp_size = vllm_config.parallel_config.data_parallel_size_local + self.kv_caches: dict[str, torch.Tensor] = {} + self.side_channel_host = get_ip() + self.max_device_id = self.tp_size * self.dp_size + self.kv_role = vllm_config.kv_transfer_config.kv_role + + # Handshake base port + self.side_channel_port = ( + vllm_config.kv_transfer_config.kv_port + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) + self.handshake_port = self.side_channel_port + self.tp_rank + self.sockets: dict = {} + + # get tp device id + # TODO(kw): https://github.com/vllm-project/vllm-ascend/pull/940 + # introducing some changes + device_ids_str = envs_ascend.PHYSICAL_DEVICES + if device_ids_str is None: + device_ids = list( + range(self.dp_rank * self.tp_size, + (self.dp_rank + 1) * self.tp_size)) + else: + device_ids = list(map(int, device_ids_str.split(','))) + start_index = self.dp_rank * self.tp_size + end_index = start_index + self.tp_size + if len(device_ids) < end_index: + raise ValueError( + f"Not enough physical devices available for DP rank {self.dp_rank}. " + f"Expected at least {end_index} devices, but found {len(device_ids)} " + "in PHYSICAL_DEVICES.") + device_ids = device_ids[start_index:end_index] + assert len(device_ids) > self.tp_rank # type: ignore + self.device_id = device_ids[self.tp_rank] # type: ignore + + self._initialize( + hostname=self.side_channel_host + ':' + '0' + ':' + 'npu_' \ + + str(self.device_id), + device_name=None) + self.te_rpc_port = self.engine.get_rpc_port() + + # Background thread for sending or receiving KV caches. 
+ self.kv_send_thread: Optional[KVCacheSendingThread] = None + self.kv_recv_thread: Optional[KVCacheRecvingThread] = None + + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + + def _get_prefill_decode_size(self, vllm_config: VllmConfig): + # get prefill tp and dp size from extra config + prefill_parallel_config: dict[ + str, Any] = vllm_config.kv_transfer_config.get_from_extra_config( + "prefill", {}) + + assert "tp_size" in prefill_parallel_config.keys() + self._prefill_tp_size = prefill_parallel_config["tp_size"] + + assert "dp_size" in prefill_parallel_config.keys() + self._prefill_dp_size = prefill_parallel_config["dp_size"] + + # get decode tp and dp size from extra config + decode_parallel_config: dict[ + str, Any] = vllm_config.kv_transfer_config.get_from_extra_config( + "decode", {}) + assert "tp_size" in decode_parallel_config.keys() + self._decode_tp_size = decode_parallel_config["tp_size"] + assert "dp_size" in decode_parallel_config.keys() + self._decode_dp_size = decode_parallel_config["dp_size"] + + def _initialize( + self, + hostname: str, + device_name: Optional[str], + ) -> None: + """Initialize the mooncake instance.""" + device_name = device_name if device_name is not None else "" + ret_value = self.engine.initialize(hostname, "P2PHANDSHAKE", "ascend", + device_name) + if ret_value != 0: + raise RuntimeError( + f"Mooncake initialization failed with ret_value: {ret_value}") + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + """Register the KV Cache data.""" + + _, first_kv_cache_tuple = next(iter(kv_caches.items())) + first_kv_cache = first_kv_cache_tuple[0] + + # TODO(tms): Find a more robust way to detect and handle MLA + self.use_mla = first_kv_cache_tuple[0].size( + -1) != first_kv_cache_tuple[1].size(-1) + if self.use_mla: + # MLA case.[num_block, block_size, 1, hidden_dim] + self.num_blocks = first_kv_cache.shape[0] + block_rank = 3 # [block_size, latent_dim] + block_shape_norm = 
first_kv_cache_tuple[0].shape[-block_rank:] + block_shape_pe = first_kv_cache_tuple[1].shape[-block_rank:] + self.block_len = [ + first_kv_cache[0].element_size() * math.prod(block_shape_norm), + first_kv_cache[1].element_size() * math.prod(block_shape_pe) + ] + logger.info( + "num_blocks: %s, block_shape_norm: %s, block_shape_pe: %s", + self.num_blocks, block_shape_norm, block_shape_pe) + else: + # [num_block, block_size, num_head, hidden_dim] + self.num_blocks = first_kv_cache.shape[0] + kv_elem_size = first_kv_cache.element_size() + block_rank = 3 # [block_size, kv_heads, head_dim] + block_shape = first_kv_cache.shape[-block_rank:] + self.block_len = [kv_elem_size * math.prod(block_shape)] + logger.info("num_blocks: %s, block_shape: %s", self.num_blocks, + block_shape) + + logger.info("Registering KV_Caches. use_mla: %s, shape %s", + self.use_mla, first_kv_cache.shape) + + self.kv_caches = kv_caches + kv_caches_base_addr = [] + for cache_or_caches in kv_caches.values(): + # Normalize to always be a list of caches + if self.use_mla: + for i, cache in enumerate(cache_or_caches, 0): + base_addr = cache.data_ptr() + region_len = self.num_blocks * self.block_len[i % 2] + kv_caches_base_addr.append(base_addr) + self._register(base_addr, region_len) + else: + cache_list = [cache_or_caches + ] if self.use_mla else cache_or_caches + for cache in cache_list: + base_addr = cache.data_ptr() + region_len = self.num_blocks * self.block_len[0] + kv_caches_base_addr.append(base_addr) + self._register(base_addr, region_len) + + # After KV Caches registered, start the sending or receiving thread. 
+ metadata = MooncakeAgentMetadata( + engine_id=self.engine_id, + te_rpc_port=self.te_rpc_port, + kv_caches_base_addr=kv_caches_base_addr, + num_blocks=self.num_blocks, + ) + + ready_event = threading.Event() + if self.kv_role == 'kv_producer': + self.kv_send_thread = KVCacheSendingThread(self.tp_rank, + self._decode_tp_size, + self.engine_id, + self.side_channel_host, + self.side_channel_port, + metadata, ready_event) + self.kv_send_thread.start() + else: + self.kv_recv_thread = KVCacheRecvingThread( + self.tp_rank, self.tp_size, self.engine, self.engine_id, + self.handshake_port, kv_caches_base_addr, self.block_len, + ready_event) + self.kv_recv_thread.start() + ready_event.wait() + + def _register(self, ptr, length): + logger.info( + "Registering KV cache: ptr=0x%x, length=%d, num_blocks=%d, " + "block_lens=%s", ptr, length, self.num_blocks, self.block_len) + ret_value = self.engine.register_memory(ptr, length) + if ret_value != 0: + raise RuntimeError("Mooncake memory registration failed.") + + def get_finished(self) -> tuple[set[str], set[str]]: + done_sending = ( + self.kv_send_thread. + get_and_clear_finished_requests( # type: ignore[union-attr] + ) if self.kv_role == 'kv_producer' else set()) + done_recving = ( + self.kv_recv_thread. + get_and_clear_finished_requests( # type: ignore[union-attr] + ) if self.kv_role == 'kv_consumer' else set()) + if self.tp_rank == 0: + logger.debug( + "Number of completed KV cache send requests: %d, receive " + "requests: %d", len(done_sending), len(done_recving)) + return done_sending, done_recving + + def start_load_kv(self, metadata: MooncakeConnectorMetadata): + """Start loading KV blocks from remote engine.""" + for req_id, meta in metadata.requests.items(): + logger.debug( + "start_load_kv for request %s from remote engine %s. " + "Num local_block_ids: %s. Num remote_block_ids: %s. 
", req_id, + meta.remote_engine_id, len(meta.local_block_ids), + len(meta.remote_block_ids)) + + remote_handshake_port = meta.remote_port + \ + self._get_remote_tp_rank(req_id) + self.kv_recv_thread.add_request( # type: ignore[union-attr] + request_id=req_id, + local_block_ids=meta.local_block_ids, + remote_block_ids=meta.remote_block_ids, + remote_engine_id=meta.remote_engine_id, + remote_host=meta.remote_host, + remote_handshake_port=remote_handshake_port, + ) + + if self.kv_send_thread is not None: + for req_id, delay_start_time in metadata.requests_to_send.items(): + if self.tp_rank in self._get_remote_tp_ranks_for_req(req_id): + self.kv_send_thread.add_delayed_request( + req_id, delay_start_time) + + def _get_remote_tp_rank(self, req_id: str) -> int: + return self._get_remote_tp_ranks_for_req(req_id)[self.tp_rank] + + def _get_remote_tp_ranks_for_req(self, req_id: str) -> list[int]: + if self._prefill_tp_size == self._decode_tp_size: + return list(range(self._prefill_tp_size)) + + seed = string_to_int64_hash(req_id) + rand = random.Random(seed) + sampled_nums = rand.sample(range(self._prefill_tp_size), + self._decode_tp_size) + return sampled_nums + + +@contextlib.contextmanager +def zmq_ctx(socket_type: Any, + addr: str) -> Iterator[zmq.Socket]: # type: ignore + """Context manager for a ZMQ socket""" + + if socket_type not in (zmq.ROUTER, zmq.REQ, zmq.DEALER): # type: ignore + raise ValueError(f"Unexpected socket type: {socket_type}") + + ctx: Optional[zmq.Context] = None # type: ignore + try: + ctx = zmq.Context() # type: ignore + yield make_zmq_socket(ctx=ctx, + path=addr, + socket_type=socket_type, + bind=socket_type == zmq.ROUTER) # type: ignore + finally: + if ctx is not None: + ctx.destroy(linger=0) + + +def group_concurrent_contiguous( + src: List[int], dst: List[int] +) -> Tuple[List[npt.NDArray[np.int64]], List[npt.NDArray[np.int64]]]: + """Vectorised NumPy implementation.""" + src_indices: npt.NDArray[np.int64] = np.array(src, dtype=np.int64) + 
dst_indices: npt.NDArray[np.int64] = np.array(dst, dtype=np.int64) + + if src_indices.size == 0: + return [], [] + + brk = np.where((np.diff(src_indices) != 1) + | (np.diff(dst_indices) != 1))[0] + 1 + src_groups = np.split(src_indices, brk) + dst_groups = np.split(dst_indices, brk) + + src_groups = [g.tolist() for g in src_groups] + dst_groups = [g.tolist() for g in dst_groups] + + return src_groups, dst_groups + + +def string_to_int64_hash(input_str): + """ + Hash the string using SHA-256 and convert it into an int64 integer. + """ + hashed_bytes = hashlib.sha256(input_str.encode("utf-8")).digest() + trunked_bytes = hashed_bytes[:8] + uint64_value = struct.unpack(" 0: + logger.warning( + f"Send failed: {e}, retrying... ({retries_left} " + "attempts left)") + time.sleep(0.1) + else: + logger.error(f"Send failed after all retries: {e}") + raise RuntimeError(f"Failed to send data after {max_retries} " + f"retries: {e}") + + +def ensure_zmq_recv( + socket: zmq.Socket, # type: ignore + poller: zmq.Poller, # type: ignore + timeout: float = 1.0, + max_retries: int = 3) -> bytes: + retries_left = max_retries + while True: + try: + if dict(poller.poll(int(timeout * 1000))): # milliseconds + data = socket.recv() + return data + else: + raise zmq.ZMQError("Receive timeout") # type: ignore + except zmq.ZMQError as e: # type: ignore + retries_left -= 1 + if retries_left > 0: + logger.warning(f"Receive failed: {e}, retrying... 
" + f"({retries_left} attempts left)") + time.sleep(0.1) + else: + logger.error(f"Receive failed after all retries: {e}") + raise RuntimeError( + f"Failed to receive data after {max_retries} " + f"retries: {e}") diff --git a/vllm_ascend/distributed/parallel_state.py b/vllm_ascend/distributed/parallel_state.py new file mode 100644 index 0000000..f81d501 --- /dev/null +++ b/vllm_ascend/distributed/parallel_state.py @@ -0,0 +1,119 @@ +from typing import Optional + +import torch +from vllm.config import ParallelConfig +from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group, + init_model_parallel_group) + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config + +# Currently, mc2 op need their own group coordinator. +_MC2: Optional[GroupCoordinator] = None +_MLP_TP: Optional[GroupCoordinator] = None + +_LMTP: Optional[GroupCoordinator] = None + + +def get_mc2_group() -> GroupCoordinator: + assert _MC2 is not None, ("mc2 group is not initialized") + return _MC2 + + +def get_lmhead_tp_group() -> GroupCoordinator: + assert _LMTP is not None, ( + "lm head tensor parallel group is not initialized") + return _LMTP + + +def get_mlp_tp_group() -> GroupCoordinator: + assert _MLP_TP is not None, ("mlp group is not initialized") + return _MLP_TP + + +def model_parallel_initialized(): + return (_MC2 is not None) + + +def init_ascend_model_parallel(parallel_config: ParallelConfig, ): + if model_parallel_initialized(): + return + assert torch.distributed.is_initialized() + world_size = torch.distributed.get_world_size() + backend = torch.distributed.get_backend(get_world_group().device_group) + + # The layout of all ranks: ExternalDP * EP + # ExternalDP is the data parallel group that is not part of the model, + # every dp rank can generate independently (in verl integration). 
+ all_ranks = torch.arange(world_size).reshape( + -1, parallel_config.data_parallel_size * + parallel_config.tensor_parallel_size) + global _MC2 + group_ranks = all_ranks.unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + + _MC2 = init_model_parallel_group(group_ranks, + get_world_group().local_rank, + backend, + group_name="mc2") + if envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE: + global _MLP_TP + assert _MLP_TP is None, ( + "mlp tensor model parallel group is already initialized") + + mlp_tp = parallel_config.data_parallel_size + + all_ranks_mlp_head = torch.arange(world_size).reshape( + -1, mlp_tp, parallel_config.pipeline_parallel_size, 1) # noqa + group_ranks = all_ranks_mlp_head.view(-1, mlp_tp).unbind(0) + group_ranks = [x.tolist() for x in group_ranks] + + # message queue broadcaster is only used in tensor model parallel group + _MLP_TP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, + backend, + group_name="mlp_tp") + + lmhead_tensor_parallel_size = get_ascend_config( + ).lmhead_tensor_parallel_size + if lmhead_tensor_parallel_size is not None: + group_ranks = [] + global _LMTP + num_lmhead_tensor_parallel_groups: int = (world_size // + lmhead_tensor_parallel_size) + for i in range(num_lmhead_tensor_parallel_groups): + ranks = list( + range(i * lmhead_tensor_parallel_size, + (i + 1) * lmhead_tensor_parallel_size)) + group_ranks.append(ranks) + _LMTP = init_model_parallel_group(group_ranks, + get_world_group().local_rank, + backend, + group_name="lmheadtp") + + +def get_mlp_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + return get_mlp_tp_group().world_size + + +def get_mlp_tensor_model_parallel_rank(): + """Return world size for the tensor model parallel group.""" + return get_mlp_tp_group().rank_in_group + + +def destroy_ascend_model_parallel(): + global _MC2 + if _MC2: + _MC2.destroy() + _MC2 = None + + global _MLP_TP + if _MLP_TP: + _MLP_TP.destroy() + _MLP_TP = 
None + + global _LMTP + if _LMTP: + _LMTP.destroy() + _LMTP = None diff --git a/vllm_ascend/distributed/tensor_parallel.py b/vllm_ascend/distributed/tensor_parallel.py new file mode 100644 index 0000000..3fff0a7 --- /dev/null +++ b/vllm_ascend/distributed/tensor_parallel.py @@ -0,0 +1,248 @@ +# Copyright (c) 2024; NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapts from: Megatron/megatron/core/tensor_parallel/mappings.py. +# This file is a part of the vllm-ascend project. +import torch + + +def _gather_along_first_dim(input_, group, output_split_sizes=None): + """Gather tensors and concatenate along the first dimension. + + Args: + input_tensor (torch.Tensor): + A tensor to be gathered. + output_split_sizes (List[int], optional): + A list specifying the sizes of the output splits along the first dimension. + If None, equal splitting is assumed. Default: None. + + Returns: + torch.Tensor: Gathered tensor. + """ + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + dim_size = list(input_.size()) + if output_split_sizes is None: + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.all_gather_into_tensor(output, + input_.contiguous(), + group=group) + else: + dim_size[0] = sum(output_split_sizes) + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + output_tensor_list = list( + torch.split(output, output_split_sizes, dim=0)) + torch.distributed.all_gather(output_tensor_list, input_, group=group) + + return output + + +def _gather_along_last_dim(input_, group): + """Gather tensors and concatenate along the last dimension.""" + + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + + dim_size = list(input_.size()) + dim_size[0] = dim_size[0] * world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.all_gather_into_tensor(output, + input_.contiguous(), + group=group) + tensor_list = output.chunk(world_size, dim=0) + output = torch.cat(tensor_list, dim=-1).contiguous() + + return output + + +def _reduce_scatter_along_first_dim(input_, + group, + input_split_sizes=None, + use_global_buffer=False): + """Reduce-scatter the input tensor across model parallel group. + + Args: + input_ (torch.Tensor): The input tensor to be reduce-scattered. + input_split_sizes (List[int], optional): A list specifying the sizes of + the input splits along the first dimension for each rank. If None, + equal splitting is assumed. Default: None. + """ + world_size = torch.distributed.get_world_size(group) + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + + if input_split_sizes is None: + dim_size = list(input_.size()) + assert ( + dim_size[0] % world_size == 0 + ), "First dimension of the tensor should be divisible by tensor parallel size" + + dim_size[0] = dim_size[0] // world_size + + output = torch.empty(dim_size, + dtype=input_.dtype, + device=torch.npu.current_device()) + torch.distributed.reduce_scatter_tensor(output, + input_.contiguous(), + group=group) + else: + rank = torch.distributed.get_rank(group) + input_tensor_list = list(torch.split(input_, input_split_sizes, dim=0)) + + output = torch.empty_like(input_tensor_list[rank]) + torch.distributed.reduce_scatter(output, + input_tensor_list, + group=group) + return output + + +def _reduce_scatter_along_last_dim(input_, group): + """Reduce-scatter tensors on the last dimension.""" + world_size = torch.distributed.get_world_size(group) + target_shape = list(input_.size()) + target_shape[-1] = target_shape[-1] // world_size + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split(input_, + split_size_or_sections=input_.shape[-1] // + world_size, + dim=1) + concat_tensor = torch.cat(split_tensors, dim=0) + output = _reduce_scatter_along_first_dim(concat_tensor, + group).reshape(target_shape) + return output + + +def all_gather_last_dim_from_tensor_parallel_region(input_, group): + """Wrapper for autograd function: forward: AG, backward RS """ + return _gather_along_last_dim(input_, group) + + +def reduce_scatter_to_sequence_parallel_region(input_, + group, + input_split_sizes=None): + """Wrapper for autograd function: forward: RS, backward AG """ + return _reduce_scatter_along_first_dim(input_, group, input_split_sizes) + + +def reduce_scatter_last_dim_to_tensor_parallel_region(input_, group): + """Wrapper for autograd function: forward: RS, backward AG: AG """ + return _reduce_scatter_along_last_dim(input_, group) + + +def gather_from_sequence_parallel_region( + input_, + group, + 
output_split_sizes=None, +): + """Wrapper for autograd function: forward: AG, backward: RS """ + return _gather_along_first_dim(input_, group, output_split_sizes) + + +def all_to_all(group, input, output_split_sizes=None, input_split_sizes=None): + world_size = torch.distributed.get_world_size(group=group) + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input + + input = input.contiguous() + if output_split_sizes is None: + # Equal split (all2all) + output = torch.empty_like(input) + else: + # Unequal split (all2all-v) + output = input.new_empty( + size=[sum(output_split_sizes)] + list(input.size()[1:]), + dtype=input.dtype, + device=torch.npu.current_device(), + ) + torch.distributed.all_to_all_single( + output, + input, + output_split_sizes=output_split_sizes, + input_split_sizes=input_split_sizes, + group=group, + ) + return output + + +def all_to_all_sp2hp(input_, group): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens/TP, H] to [num_tokens, H/TP]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the sequence + dimension. + + Returns: + torch.Tensor: The output tensor with shape [num_tokens, H/TP]. + + """ + if group is None: + return input_ + world_size = torch.distributed.get_world_size(group=group) + tp_group = group + input_ = input_.reshape(-1, input_.shape[-1]) + split_tensors = torch.split(input_, + split_size_or_sections=input_.shape[-1] // + world_size, + dim=1) + concat_tensor = torch.cat(split_tensors, dim=0) + output = all_to_all(tp_group, concat_tensor) + return output + + +def all_to_all_hp2sp(input_, group): + """ + Perform AlltoAll communication on tensor parallel group, transform the input tensor from shape + [num_tokens, H/TP] to [num_tokens/TP, H]. + + Args: + input_ (torch.Tensor): + The input tensor which has been distributed along the hidden + dimension. 
+ + Returns: + torch.Tensor: The output tensor with shape [num_tokens/TP, H]. + """ + if group is None: + return input_ + world_size = torch.distributed.get_world_size(group=group) + input_ = input_.reshape(-1, input_.shape[-1]) + tp_group = group + input_exchanged = all_to_all(tp_group, input_) + input_reshaped = input_exchanged.reshape(-1, input_exchanged.shape[-1]) + split_tensors = torch.split( + input_reshaped, + split_size_or_sections=input_reshaped.shape[0] // world_size, + dim=0) + output = torch.cat(split_tensors, dim=-1) + return output diff --git a/vllm_ascend/envs.py b/vllm_ascend/envs.py new file mode 100644 index 0000000..78f8c50 --- /dev/null +++ b/vllm_ascend/envs.py @@ -0,0 +1,160 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# This file is mainly Adapted from vllm-project/vllm/vllm/envs.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import Any, Callable, Dict + +# The begin-* and end* here are used by the documentation generator +# to extract the used env vars. + +# begin-env-vars-definition + +env_variables: Dict[str, Callable[[], Any]] = { + # max compile thread number for package building. Usually, it is set to + # the number of CPU cores. If not set, the default value is None, which + # means all number of CPU cores will be used. 
+ "MAX_JOBS": + lambda: os.getenv("MAX_JOBS", None), + # The build type of the package. It can be one of the following values: + # Release, Debug, RelWithDebugInfo. If not set, the default value is Release. + "CMAKE_BUILD_TYPE": + lambda: os.getenv("CMAKE_BUILD_TYPE"), + # Whether to compile custom kernels. If not set, the default value is True. + # If set to False, the custom kernels will not be compiled. Please note that + # the sleep mode feature will be disabled as well if custom kernels are not + # compiled. + "COMPILE_CUSTOM_KERNELS": + lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))), + # The CXX compiler used for compiling the package. If not set, the default + # value is None, which means the system default CXX compiler will be used. + "CXX_COMPILER": + lambda: os.getenv("CXX_COMPILER", None), + # The C compiler used for compiling the package. If not set, the default + # value is None, which means the system default C compiler will be used. + "C_COMPILER": + lambda: os.getenv("C_COMPILER", None), + # The version of the Ascend chip. If not set, the default value is + # ASCEND910B1(Available for A2 and A3 series). It's used for package building. + # Please make sure that the version is correct. + "SOC_VERSION": + lambda: os.getenv("SOC_VERSION", "ASCEND910B1"), + # If set, vllm-ascend will print verbose logs during compilation + "VERBOSE": + lambda: bool(int(os.getenv('VERBOSE', '0'))), + # The home path for CANN toolkit. If not set, the default value is + # /usr/local/Ascend/ascend-toolkit/latest + "ASCEND_HOME_PATH": + lambda: os.getenv("ASCEND_HOME_PATH", None), + # The path for HCCL library, it's used by pyhccl communicator backend. If + # not set, the default value is libhccl.so。 + "HCCL_SO_PATH": + lambda: os.environ.get("HCCL_SO_PATH", None), + # The version of vllm is installed. This value is used for developers who + # installed vllm from source locally. In this case, the version of vllm is + # usually changed. 
For example, if the version of vllm is "0.9.0", but when + # it's installed from source, the version of vllm is usually set to "0.9.1". + # In this case, developers need to set this value to "0.9.0" to make sure + # that the correct package is installed. + "VLLM_VERSION": + lambda: os.getenv("VLLM_VERSION", None), + # Whether to enable the trace recompiles from pytorch. + "VLLM_ASCEND_TRACE_RECOMPILES": + lambda: bool(int(os.getenv("VLLM_ASCEND_TRACE_RECOMPILES", '0'))), + # Whether to enable fused_experts_allgather_ep. MoeInitRoutingV3 and + # GroupedMatmulFinalizeRouting operators are combined to implement EP. + "VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP": + lambda: bool(int(os.getenv("VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP", '0')) + ), + # Whether to enable DBO feature for deepseek model. + "VLLM_ASCEND_ENABLE_DBO": + lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_DBO", '0'))), + # Whether to enable the model execute time observe profile. Disable it when + # running vllm ascend in production environment. + "VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE": + lambda: bool(int(os.getenv("VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE", '0')) + ), + # Some models are optimized by vllm ascend. While in some case, e.g. rlhf + # training, the optimized model may not be suitable. In this case, set this + # value to False to disable the optimized model. + "USE_OPTIMIZED_MODEL": + lambda: bool(int(os.getenv('USE_OPTIMIZED_MODEL', '1'))), + # The tolerance of the kv cache size, if the difference between the + # actual kv cache size and the cached kv cache size is less than this value, + # then the cached kv cache size will be used. + "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE": + lambda: int( + os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", 64)), + # Whether to enable the topk optimization. It's enabled by default. Please set to False if you hit any issue. + # We'll remove this flag in the future once it's stable enough. 
+ "VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION": + lambda: bool( + int(os.getenv("VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION", '1'))), + # `LLMDataDistCMgrConnector` required variable. `DISAGGREGATED_PREFILL_RANK_TABLE_PATH` is + # used for llmdatadist to build the communication topology for kv cache transfer, it is + # a required variable if `LLMDataDistCMgrConnector` is used as kv connector for disaggregated + # pd. The rank table can be generated by adopting the script `gen_ranktable.sh` + # in vllm_ascend's example folder. + "DISAGGREGATED_PREFILL_RANK_TABLE_PATH": + lambda: os.getenv("DISAGGREGATED_PREFILL_RANK_TABLE_PATH", None), + # `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_IP` is used as the + # rpc communication listening ip, which will be used to receive the agent metadata from the + # remote worker. + "VLLM_ASCEND_LLMDD_RPC_IP": + lambda: os.getenv("VLLM_ASCEND_LLMDD_RPC_IP", "0.0.0.0"), + # `LLMDataDistCMgrConnector` required variable. `VLLM_ASCEND_LLMDD_RPC_PORT` is used as the + # rpc communication listening port, which will be used to receive the agent metadata from the + # remote worker. + "VLLM_ASCEND_LLMDD_RPC_PORT": + lambda: int(os.getenv("VLLM_ASCEND_LLMDD_RPC_PORT", 5557)), + # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible + # and the mla_pa will be the default path of deepseek decode path. + "VLLM_ASCEND_MLA_PA": + lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)), + # Whether to enable MatmulAllReduce fusion kernel when tensor parallel is enabled. + # this feature is supported in A2, and eager mode will get better performance. + "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE": + lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", '0'))), + # Whether to enable mlp optimize when tensor parallel is enabled. + # this feature in eager mode will get better performance. 
+ "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE": + lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))), + # Determine the number of physical devices in a non-full-use scenario + # caused by the initialization of the Mooncake connector. + "PHYSICAL_DEVICES": + lambda: os.getenv("PHYSICAL_DEVICES", None), + # Timeout (in seconds) for delayed KVCache block release. In the prefill + # node, if a request is marked for delayed KV block release and the blocks + # are not freed within this timeout, they will be forcibly released. + "VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT": + lambda: int(os.getenv("VLLM_ASCEND_KVCACHE_DELAY_FREE_TIMEOUT", 250)), +} + +# end-env-vars-definition + + +def __getattr__(name: str): + # lazy evaluation of environment variables + if name in env_variables: + return env_variables[name]() + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + return list(env_variables.keys()) diff --git a/vllm_ascend/lora/__init__.py b/vllm_ascend/lora/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/lora/punica_wrapper/__init__.py b/vllm_ascend/lora/punica_wrapper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/lora/punica_wrapper/lora_ops.py b/vllm_ascend/lora/punica_wrapper/lora_ops.py new file mode 100644 index 0000000..e8bf8ad --- /dev/null +++ b/vllm_ascend/lora/punica_wrapper/lora_ops.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+
+# NOTE(review): the functions below are thin adapters from vLLM's punica op
+# signatures to the vllm_ascend custom NPU kernels exposed as torch.ops._C.*.
+# Several punica-side arguments (b_seq_start_loc, batches, max_seq_length,
+# token_nums, add_inputs) are accepted for interface compatibility but are
+# NOT forwarded to the custom ops -- presumably the NPU kernels derive them
+# from seq_len_tensor / always accumulate; TODO confirm against csrc/kernels.
+def bgmv_shrink(inputs: torch.Tensor,
+                lora_a_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                scaling: float = 1.0):
+    # Forward to the custom op; note the op takes indices BEFORE the output
+    # tensor, unlike this wrapper's parameter order.
+    return torch.ops._C.bgmv_shrink(
+        inputs,
+        lora_a_weights,
+        lora_indices_tensor,
+        output_tensor,
+        scaling,
+    )
+
+
+def bgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                add_inputs: bool = True):
+    # Full-width expand: slice offset 0 and slice size = output width.
+    # NOTE(review): `add_inputs` is not forwarded -- verify the NPU op
+    # accumulates into `output_tensor` unconditionally.
+    return torch.ops._C.bgmv_expand(
+        inputs,
+        lora_b_weights,
+        lora_indices_tensor,
+        output_tensor,
+        0,
+        output_tensor.size(1),
+    )
+
+
+def bgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = True):
+    # Same custom op as bgmv_expand, but writing only the requested slice
+    # [slice_offset, slice_offset + slice_size) of the output.
+    return torch.ops._C.bgmv_expand(inputs, lora_b_weights,
+                                    lora_indices_tensor, output_tensor,
+                                    slice_offset, slice_size)
+
+
+def sgmv_shrink(
+    inputs: torch.Tensor,
+    lora_a_weights: torch.Tensor,
+    output_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    lora_indices_tensor: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    token_nums: int,
+    scaling: float,
+):
+    # Only (inputs, weights, indices, seq lens, output, scaling) reach the op.
+    return torch.ops._C.sgmv_shrink(inputs, lora_a_weights,
+                                    lora_indices_tensor, seq_len_tensor,
+                                    output_tensor, scaling)
+
+
+def sgmv_expand(inputs: torch.Tensor,
+                lora_b_weights: torch.Tensor,
+                output_tensor: torch.Tensor,
+                b_seq_start_loc: torch.Tensor,
+                seq_len_tensor: torch.Tensor,
+                lora_indices_tensor: torch.Tensor,
+                batches: int,
+                max_seq_length: int,
+                token_nums: int,
+                add_inputs: bool = False):
+    # Full-width expand over the whole output (offset 0, size(1)).
+    return torch.ops._C.sgmv_expand(
+        inputs,
+        lora_b_weights,
+        lora_indices_tensor,
+        seq_len_tensor,
+        output_tensor,
+        0,
+        output_tensor.size(1),
+    )
+
+
+def sgmv_expand_slice(inputs: torch.Tensor,
+                      lora_b_weights: torch.Tensor,
+                      output_tensor: torch.Tensor,
+                      b_seq_start_loc: torch.Tensor,
+                      seq_len_tensor: torch.Tensor,
+                      lora_indices_tensor: torch.Tensor,
+                      batches: int,
+                      max_seq_length: int,
+                      token_nums: int,
+                      slice_offset: int,
+                      slice_size: int,
+                      add_inputs: bool = False):
+    # Slice variant of sgmv_expand; same custom op, explicit offset/size.
+    return torch.ops._C.sgmv_expand(inputs, lora_b_weights,
+                                    lora_indices_tensor, seq_len_tensor,
+                                    output_tensor, slice_offset, slice_size)
diff --git a/vllm_ascend/lora/punica_wrapper/punica_npu.py b/vllm_ascend/lora/punica_wrapper/punica_npu.py
new file mode 100644
index 0000000..a85c837
--- /dev/null
+++ b/vllm_ascend/lora/punica_wrapper/punica_npu.py
@@ -0,0 +1,364 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Callable, Optional, Tuple, Union
+
+import torch
+
+from vllm_ascend.utils import is_310p
+
+# On 310P fall back to vLLM's pure-PyTorch LoRA ops; otherwise use the
+# custom NPU kernels wrapped in lora_ops.py.
+if is_310p():
+    from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
+                                         bgmv_shrink, sgmv_expand,
+                                         sgmv_expand_slice, sgmv_shrink)
+else:
+    from vllm_ascend.lora.punica_wrapper.lora_ops import (
+        bgmv_expand, bgmv_expand_slice, bgmv_shrink, sgmv_expand,
+        sgmv_expand_slice, sgmv_shrink)
+
+from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase
+
+
+# The platforms that are compatible with the PyTorch-native implementation can
+# inherit this class
+class PunicaWrapperNPU(PunicaWrapperBase):
+    """
+    PunicaWrapperNPU is designed to manage and provide metadata for the punica
+    kernel. The main function is to maintain the state information for
+    Multi-LoRA, and to provide the interface for the pytorch punica ops.
+    """
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        # All metadata management is delegated to the base class.
+        PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches,
+                                   device)
+
+    def _shrink_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_shrink(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            scale,
+        )
+
+    def _shrink_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        scale: float,
+    ):
+        # Decode path: one token per request, per-token bgmv kernel.
+        bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale)
+
+    def _expand_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            add_inputs,
+        )
+
+    def _expand_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_inputs: bool,
+    ):
+        bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_inputs)
+
+    def _expand_slice_prefill(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        # No LoRA request, so return directly
+        if self.no_lora:
+            return
+        sgmv_expand_slice(
+            x,
+            w_t_all,
+            y,
+            *self.prefill_metadata,
+            y_offset,
+            y_slice_size,
+            add_inputs,
+        )
+
+    def _expand_slice_decode(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool,
+    ):
+        bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset,
+                          y_slice_size, add_inputs)
+
+    def _apply_expand(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        y_offset: int,
+        y_slice_size: int,
+        add_inputs: bool = True,
+    ):
+        """
+        Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all`
+        computation, which is suitable for the
+        GEMM of lora'b.
+        """
+        # Dispatch on the current stage (prefill vs decode).
+        expand_slice_fun: Callable = (self._expand_slice_prefill
+                                      if self.is_prefill else
+                                      self._expand_slice_decode)
+        expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_inputs)
+
+    def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor,
+                      w_t_all: torch.Tensor, scale: float):
+        """
+        Perform the ` y+=x@w_t_all` computation, which is suitable for the
+        GEMM of lora'a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+        """
+        y_org = y
+        # Flatten to 2D (tokens, hidden); the kernels write in place.
+        y = y.view(-1, y.shape[-1])
+        shrink_fun: Callable = (self._shrink_prefill
+                                if self.is_prefill else self._shrink_decode)
+        shrink_fun(y, x, w_t_all, scale)
+        y = y.view_as(y_org)
+
+    def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...],
+                   scale: float, **kwargs):
+        """
+        Performs GEMM for multiple slices of lora_a.
+        When `is_prefill is` true, it indicates that it is currently the
+        prefill stage, and the `_shrink_prefill` function should be called.
+        Otherwise, it is the decode stage, and the _shrink_decode function
+        should be called.
+
+        Semantics:
+        for i in range(len(lora_a_stacked)):
+            y[i] += (x @ lora_a_stacked[i]) * scale
+
+        Args:
+            y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights
+            scale (float): Scaling factor for the operation
+        """
+
+        x = x.view(-1, x.shape[-1])
+        # TODO fuse these kernels
+        for slice_idx in range(len(lora_a_stacked)):
+            self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx],
+                               scale)
+
+    def add_expand(self,
+                   y: torch.Tensor,
+                   x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+                   lora_b_stacked: Tuple[torch.Tensor, ...],
+                   lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                   output_slices: Tuple[int, ...],
+                   offset_start: int = 0,
+                   add_inputs=True,
+                   **kwargs) -> None:
+        """
+        Performs GEMM and bias addition for multiple slices of lora_b.
+
+        Semantics:
+            for i in range(len(lora_b_stacked)):
+                slice = output_slices[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] +
+                    lora_bias_stacked[i]
+                offset += slice
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]):
+                bias's weight
+            output_slices (Tuple[int, ...]): Every slice's size
+            add_inputs (bool): Defaults to True.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        offset_left = offset_start
+        if lora_bias_stacked is not None:
+            self._apply_bias(self.token_lora_indices, y, output_slices,
+                             lora_bias_stacked)
+        # Each slice writes its own column range; offsets accumulate.
+        for slice_idx in range(len(lora_b_stacked)):
+            self._apply_expand(
+                y,
+                x[slice_idx],
+                lora_b_stacked[slice_idx],
+                offset_left,
+                output_slices[slice_idx],
+                add_inputs=add_inputs,
+            )
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_inputs: bool = True,
+                           **kwargs) -> None:
+        """
+        Applies lora specifically for VocabParallelEmbeddingWithLoRA.
+
+        Semantics:
+            y += x @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_b_stacked (torch.Tensor): lora_b's weights.
+            add_inputs (bool): Default to True.
+        """
+
+        # Embedding layer only need expand op
+        expand_fun: Callable = (self._expand_prefill
+                                if self.is_prefill else self._expand_decode)
+        expand_fun(y, x, lora_b_stacked, add_inputs)
+
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        """
+        Applicable to linear-related lora.
+
+        Semantics:
+            for i in range(len(lora_a_stacked)):
+                y[i] += (
+                    x[i].unsqueeze(0)
+                    @ lora_a_stacked[indices[i], layer_idx, :, :]
+                    @ lora_b_stacked[indices[i], layer_idx, :, :]
+                    * scale
+                    ).squeeze(0)+lora_bias_stacked[i]
+
+        Args:
+            y (torch.Tensor): Output tensor. Will be changed in-place.
+            x (torch.Tensor): Input tensor
+            lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight.
+            lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight.
+            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias.
+            scale (float): Scaling factor.
+            output_slices (Tuple[int, ...]): Every slice's size.
+            buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None.
+        """
+
+        assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices)
+        if lora_bias_stacked is not None:
+            assert len(lora_bias_stacked) == len(output_slices)
+            y = self._apply_bias(self.token_lora_indices, y, output_slices,
+                                 lora_bias_stacked)
+
+        if buffer is None:
+            r = lora_b_stacked[0].size(-1)
+            # We set the buffer to be float32 by default, consistent with the
+            # triton op
+            buffer = tuple(
+                torch.zeros(
+                    (x.size(0), r), dtype=torch.float32, device=x.device)
+                for _ in range(len(output_slices)))
+        self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
+        self.add_expand(y,
+                        buffer,
+                        lora_b_stacked,
+                        None,
+                        output_slices,
+                        add_inputs=True,
+                        **kwargs)
+
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        """
+        Applies lora specifically for LogitsProcessorWithLoRA.
+
+        Semantics:
+            buffer = (x @ lora_a_stacked) * scale
+            y += buffer @ lora_b_stacked
+
+        Args:
+            y (torch.Tensor): Output tensor.
+            x (torch.Tensor): Input tensor.
+            lora_a_stacked (torch.Tensor): lora_a's weights.
+            lora_b_stacked (torch.Tensor):lora_b's weights.
+            scale (float): Scaling factor.
+            buffer (Optional[torch.Tensor]):Default to None.
+        """
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+
+        # Normalize 2D weights to a 3D (num_loras, ...) stack.
+        if lora_a_stacked.dim() == 2:
+            lora_a_stacked = lora_a_stacked.unsqueeze(0)
+        if lora_b_stacked.dim() == 2:
+            lora_b_stacked = lora_b_stacked.unsqueeze(0)
+
+        r = lora_a_stacked.size(-1)
+
+        if buffer is None:
+            buffer = torch.zeros((x.size(0), r),
+                                 dtype=torch.float32,
+                                 device=x.device)
+
+        # NOTE(review): out-of-range sampler indices are clamped to the last
+        # LoRA slot rather than rejected -- confirm this is the intended
+        # handling for padded/invalid indices.
+        indices = self.sampler_indices
+        if indices.max() >= lora_a_stacked.size(0):
+            indices = torch.clamp(indices, 0, lora_a_stacked.size(0) - 1)
+
+        # bgmv kernels expect the transposed weight layout.
+        lora_a_reshaped = lora_a_stacked.transpose(1, 2)
+        lora_b_reshaped = lora_b_stacked.transpose(1, 2)
+
+        bgmv_shrink(x, lora_a_reshaped, buffer, indices, scale)
+        bgmv_expand(buffer, lora_b_reshaped, y, indices, add_inputs=True)
+
+        y = y.view_as(y_org)
diff --git a/vllm_ascend/meta_registration.py b/vllm_ascend/meta_registration.py
new file mode 100644
index 0000000..47c7758
--- /dev/null
+++ b/vllm_ascend/meta_registration.py
@@ -0,0 +1,104 @@
+import torch
+from torch.library import Library
+
+# This file provides a template and registration utilities for writing "meta" implementations
+# of custom operators in Python for the vllm_ascend project.
+#
+# We offer two ways to implement meta implementations for custom ops:
+# 1. Python meta implementation (as shown in this file): Write a Python function that
+#    takes the same arguments as your operator and returns empty tensors with the correct
+#    shapes and dtypes. This is useful for rapid prototyping and for ops that are only
+#    used in Python.
+# 2. C++ meta implementation: You can also implement the meta function in C++ for better
+#    performance or to match the C++ op logic more closely. See `torch_binding_meta.cpp`
+#    for examples of C++ meta implementations and how to register them.
+#
+# Both approaches enable tracing, export, and shape inference in PyTorch and vLLM, which
+# is essential for supporting `torch.compile` and aclgraph.
+
+# How to add a new meta implementation in Python:
+# -------------------------------------
+# 1. Write a Python function that takes the same arguments as your operator, and returns
+#    empty tensors (using torch.empty_like, torch.empty, etc.) with the correct shapes and dtypes.
+#    Do NOT perform any real computation or allocate device memory.
+#
+# 2. Register your meta function using `register_meta_if_necessary`, providing:
+#    - The namespace (usually "_C" for custom ops)
+#    - The operator name (as registered in C++)
+#    - The Python meta function
+#    - (Optional) The overload name, if your op has overloads
+#
+# 3. The registration utility will check if a meta implementation already exists for your op,
+#    and only register if necessary. This avoids duplicate registrations.
+#
+# 4. Example meta implementations are provided below for rotary_embedding and get_masked_input_and_mask.
+#
+# 5. When developing new custom ops, always provide a meta implementation to enable tracing,
+#    export, and shape inference in PyTorch and vLLM to enable the capture of `torch.compile`
+#    and aclgraph.
+#
+# For more details, see: https://pytorch.org/docs/stable/notes/extending.html#meta-tensors
+
+# IMPL library: attaches implementations (here, for the Meta dispatch key)
+# to ops whose schemas are defined by the C++ extension under "_C".
+lib = Library("_C", "IMPL")
+
+
+def register_meta_if_necessary(ns: str, op_name: str, fn, overload: str = ""):
+    # Register `fn` as the Meta-kernel of `ns::op_name[.overload]`, skipping
+    # registration when a Meta implementation already exists (e.g. one
+    # registered from C++), to avoid duplicate-registration errors.
+    if overload != "":
+        op_name = op_name + "." + overload
+    schema_to_find = ns + "::" + op_name
+    meta_impl_list = torch._C._dispatch_get_registrations_for_dispatch_key(
+        "Meta")
+    if schema_to_find in meta_impl_list:
+        return
+    lib.impl(op_name, fn, "Meta")
+
+
+def rotary_embedding_meta(positions: torch.Tensor, query: torch.Tensor,
+                          key: torch.Tensor, head_size: int,
+                          cos_sin_cache: torch.Tensor, is_neox: bool):
+    # Shape-only computation: head counts are derived from the flattened
+    # hidden sizes, no real rotary math is performed.
+    num_tokens = positions.numel()
+    query_hidden_size = query.numel() // num_tokens
+    key_hidden_size = key.numel() // num_tokens
+    num_heads = query_hidden_size // head_size
+    num_kv_heads = key_hidden_size // head_size
+
+    query_dst = torch.empty_like(query).view(num_tokens, num_heads, head_size)
+    key_dst = torch.empty_like(key).view(num_tokens, num_kv_heads, head_size)
+    return query_dst, key_dst
+
+
+def get_masked_input_and_mask_meta(input: torch.Tensor,
+                                   org_vocab_start_index: int,
+                                   org_vocab_end_index: int,
+                                   num_org_vocab_padding: int,
+                                   added_vocab_start_index: int,
+                                   added_vocab_end_index: int):
+    # Outputs mirror the input's shape: remapped ids plus a boolean mask.
+    masked_input = torch.empty_like(input)
+    mask = torch.empty_like(input).to(torch.bool)
+
+    return masked_input, mask
+
+
+def bgmv_expand_meta(x: torch.Tensor, weight: torch.Tensor,
+                     indices: torch.Tensor, y: torch.Tensor, slice_offset: int,
+                     slice_size: int):
+    # Output has the same shape/dtype as the destination tensor `y`.
+    y_out = torch.empty_like(y)
+    return y_out
+
+
+def sgmv_expand_meta(x: torch.Tensor, weight: torch.Tensor,
+                     lora_indices: torch.Tensor, seq_len: torch.Tensor,
+                     y: torch.Tensor, slice_offset: int, slice_size: int):
+    # Output has the same shape/dtype as the destination tensor `y`.
+    y_out = torch.empty_like(y)
+    return y_out
+
+
+register_meta_if_necessary("_C", "rotary_embedding", rotary_embedding_meta)
+register_meta_if_necessary("_C", "get_masked_input_and_mask",
+                           get_masked_input_and_mask_meta)
+register_meta_if_necessary("_C", "bgmv_expand", bgmv_expand_meta)
+register_meta_if_necessary("_C", "sgmv_expand", sgmv_expand_meta)
diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py
new file mode 100644
index 0000000..dfb47fe
--- /dev/null
+++
b/vllm_ascend/models/__init__.py
@@ -0,0 +1,61 @@
+from vllm import ModelRegistry
+
+import vllm_ascend.envs as envs_ascend
+
+
+def register_model():
+    """Register Ascend-optimized model classes with vLLM's ModelRegistry.
+
+    Registration uses "module:Class" string paths, so classes are loaded
+    lazily; the eager imports below exist to surface import errors early
+    (hence the `noqa: F401`). Which implementation backs a given
+    architecture can depend on vllm_ascend env flags (USE_OPTIMIZED_MODEL,
+    VLLM_ASCEND_ENABLE_DBO), evaluated once at call time.
+    """
+    from .deepseek_dbo import CustomDeepseekDBOForCausalLM  # noqa: F401
+    from .deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
+    from .deepseek_v2 import CustomDeepseekV2ForCausalLM  # noqa: F401
+    from .deepseek_v3 import CustomDeepseekV3ForCausalLM  # noqa: F401
+    from .qwen2_5_vl import \
+        AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
+    from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
+    from .qwen3 import CustomQwen3ForCausalLM  # noqa: F401
+
+    ModelRegistry.register_model(
+        "DeepSeekMTPModel",
+        "vllm_ascend.models.deepseek_mtp:CustomDeepSeekMTP")
+
+    ModelRegistry.register_model(
+        "Qwen2VLForConditionalGeneration",
+        "vllm_ascend.models.qwen2_vl:AscendQwen2VLForConditionalGeneration")
+
+    # Qwen2.5-VL: padded vs non-padded variant, selected by env flag.
+    if envs_ascend.USE_OPTIMIZED_MODEL:
+        ModelRegistry.register_model(
+            "Qwen2_5_VLForConditionalGeneration",
+            "vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration"
+        )
+    else:
+        ModelRegistry.register_model(
+            "Qwen2_5_VLForConditionalGeneration",
+            "vllm_ascend.models.qwen2_5_vl_without_padding:AscendQwen2_5_VLForConditionalGeneration_Without_Padding"
+        )
+
+    # DeepSeek V2/V3: DBO (dual-batch-overlap) variant when enabled.
+    if envs_ascend.VLLM_ASCEND_ENABLE_DBO:
+        ModelRegistry.register_model(
+            "DeepseekV2ForCausalLM",
+            "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM")
+
+        ModelRegistry.register_model(
+            "DeepseekV3ForCausalLM",
+            "vllm_ascend.models.deepseek_dbo:CustomDeepseekDBOForCausalLM")
+    else:
+        ModelRegistry.register_model(
+            "DeepseekV2ForCausalLM",
+            "vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM")
+
+        ModelRegistry.register_model(
+            "DeepseekV3ForCausalLM",
+            "vllm_ascend.models.deepseek_v3:CustomDeepseekV3ForCausalLM")
+
+    ModelRegistry.register_model(
+        "Qwen3MoeForCausalLM",
+        "vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM")
+
+    ModelRegistry.register_model(
+        "Qwen3ForCausalLM", "vllm_ascend.models.qwen3:CustomQwen3ForCausalLM")
+
+    ModelRegistry.register_model(
+        "PanguProMoEForCausalLM",
+        "vllm_ascend.models.pangu_moe:PanguProMoEForCausalLM")
diff --git a/vllm_ascend/models/deepseek_dbo.py b/vllm_ascend/models/deepseek_dbo.py
new file mode 100644
index 0000000..9469e99
--- /dev/null
+++ b/vllm_ascend/models/deepseek_dbo.py
@@ -0,0 +1,1046 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# # Adapted from +# # vllm-project/vllm/blob/main/vllm/model_executor/models/deepseek_v2.py +# # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py +# """Inference-only DeepseekV2/DeepseekV3 model.""" + +from typing import Any, Dict, Iterable, List, Optional, Union + +import torch +import torch.distributed as dist +import torch_npu # noqa: F401 +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.distributed import (get_pp_group, + get_tensor_model_parallel_world_size, + get_tp_group, tensor_model_parallel_all_reduce) +from vllm.distributed.parallel_state import get_dp_group +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + ReplicatedLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import \ + DeepseekV2ForCausalLM # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import \ + yarn_get_mscale # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import ( + DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention, + get_spec_layer_idx_from_weight_name) +from vllm.model_executor.models.utils import ( + PPMissingLayer, is_pp_missing_parameter, + 
make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.models.deepseek_v2 import (CustomDeepseekV2MLP, + CustomDeepseekV2RowParallelLinear) +from vllm_ascend.multistream.base import MSEventKey +from vllm_ascend.multistream.context import ( + advance_step_multistream_layer_context, get_multistream_comm_context, + get_multistream_layer_context, set_multistream_context) +from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer, + MultiStreamPreTransformerLayer) +from vllm_ascend.multistream.metadata import (MultiStreamConfig, + MultiStreamStepMetadata, + make_multistream_metadata_ds) +from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.utils import dispose_tensor + +VLLM_ASCEND_ENABLE_DBO: bool = envs_ascend.VLLM_ASCEND_ENABLE_DBO + + +class CustomDeepseekDBOMLP(CustomDeepseekV2MLP): + + def _forward_ms_mlp(self, x): + current_ms_metadata = get_multistream_comm_context() + assert current_ms_metadata is not None + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + x, _ = self.down_proj(x) + current_ms_metadata.after_comm_event.record() + return x + + +class CustomDeepseekDBOMoE(nn.Module): + + top_k: int + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + self.n_shared_experts = config.n_shared_experts + self.routed_scaling_factor = config.routed_scaling_factor + if self.tp_size > config.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than 
" + f"the number of experts {config.n_routed_experts}.") + + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now.") + + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = AscendFusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + self.shared_experts = CustomDeepseekDBOMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=True, + prefix=f"{prefix}.shared_experts", + ) + CustomDeepseekDBOMoE.top_k = config.num_experts_per_tok + + self.dp_size = get_dp_group().world_size + + self.tp_group = get_tp_group().device_group + self.tp_rank = get_tp_group().rank_in_group + + self.params_dtype = torch.get_default_dtype() + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + forward_context = get_forward_context() + # when profile runs, force experts to load balanced tokens + # to avoid 
high memory consumption on a single rank. + enable_force_load_balance = forward_context.in_profile_run + + is_prefill = forward_context.with_prefill + + old_hidden_states = hidden_states.clone() + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + + hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + is_prefill=is_prefill, + top_k=CustomDeepseekDBOMoE.top_k, + enable_force_load_balance=enable_force_load_balance, + ) * self.routed_scaling_factor + + if self.n_shared_experts is not None: + shared_output = self.shared_experts(old_hidden_states) + + if shared_output is not None: + hidden_states = hidden_states + shared_output + + return hidden_states + + # ----------------------------------------- TBO-related -------------------------------------------- + def _forward_ms_op_shared_expert( + self, + hidden_states: torch.Tensor, + ): + shared_output = self.shared_experts._forward_ms_mlp(hidden_states) + return shared_output + + def _forward_ms_op_gate( + self, + hidden_states: torch.Tensor, + ): + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + return router_logits + + def _forward_ms_op_tp_allgather( + self, + hidden_states: torch.Tensor, + chunk_hidden_states: torch.Tensor, + num_tokens: int = 0, + ): + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is None: + dist.all_gather(list(chunk_hidden_states), hidden_states, + self.tp_group) + final_hidden_states = torch.cat(chunk_hidden_states, dim=0) + if num_tokens > 0: + final_hidden_states = final_hidden_states[:-num_tokens] + else: + current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + dist.all_gather(list(chunk_hidden_states), hidden_states, + self.tp_group) + final_hidden_states = torch.cat(chunk_hidden_states, dim=0) + if num_tokens > 0: + final_hidden_states = 
final_hidden_states[:-num_tokens] + current_ms_metadata.after_comm_event.record() + return final_hidden_states + + +class CustomDeepseekDBOMLAAttention(DeepseekV2MLAAttention): + + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: Optional[int], + kv_lora_rank: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + nn.Module.__init__(self) + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + + self.num_heads = num_heads + tp_size = get_tensor_model_parallel_world_size() + assert num_heads % tp_size == 0 + self.num_local_heads = num_heads // tp_size + + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + if self.q_lora_rank is not None: + self.q_a_proj = ReplicatedLinear(self.hidden_size, + self.q_lora_rank, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_a_proj") + self.q_a_layernorm = RMSNorm(self.q_lora_rank, + eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear(q_lora_rank, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_b_proj") + else: + self.q_proj = ColumnParallelLinear(self.hidden_size, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_proj") + + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + 
prefix=f"{prefix}.kv_a_proj_with_mqa") + self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, + eps=config.rms_norm_eps) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj") + self.o_proj = CustomDeepseekV2RowParallelLinear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.rotary_emb = get_rope(qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=False) + if rope_scaling: + mscale_all_dim = rope_scaling.get("mscale_all_dim", False) + scaling_factor = rope_scaling["factor"] + mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) + self.scaling = self.scaling * mscale * mscale + + # In the MLA backend, kv_cache includes both k_c and + # pe (i.e. decoupled position embeddings). In particular, + # the concat_and_cache_mla op requires + # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) + # i.e. 
+ # kv_lora_rank + qk_rope_head_dim == head_size + self.mla_attn = Attention( + num_heads=self.num_local_heads, + head_size=self.kv_lora_rank + self.qk_rope_head_dim, + scale=self.scaling, + num_kv_heads=1, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_mla=True, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_head_dim, + v_head_dim=self.v_head_dim, + rotary_emb=self.rotary_emb, + q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + o_proj=self.o_proj, + ) + + self.prefix = prefix + self.debug_layer_idx = int(self.prefix.split(".")[-2]) + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + if self.q_lora_rank is not None: + ckq = self.q_a_proj(hidden_states)[0] + hidden_states_or_q_c = self.q_a_layernorm(ckq) + else: + hidden_states_or_q_c = hidden_states + if self.torchair_graph_enabled: + forward_kwargs = {} + output_shape = hidden_states.shape + output = torch.empty(output_shape, + dtype=hidden_states_or_q_c.dtype, + device=hidden_states_or_q_c.device) + forward_kwargs['output'] = output + output = self.mla_attn.impl.forward(self.mla_attn, + hidden_states_or_q_c, + hidden_states, None, kv_cache, + attn_metadata, + **forward_kwargs) + output = output.view(-1, output_shape[-1]) + return output + else: + kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + return 
self.mla_attn(hidden_states_or_q_c, + kv_c_normed, + k_pe, + output_shape=hidden_states.shape) + + +class CustomDeepseekDBODecoderLayer(DeepseekV2DecoderLayer): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + nn.Module.__init__(self) + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep='.')[-1]) + self.layer_idx = layer_idx + # TODO: enable mla in vllm-ascend + if model_config.use_mla: + attn_cls = CustomDeepseekDBOMLAAttention + else: + attn_cls = DeepseekV2Attention + self.self_attn = attn_cls( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank + if hasattr(config, "q_lora_rank") else None, + kv_lora_rank=config.kv_lora_rank, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): + self.mlp = CustomDeepseekDBOMoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + else: + self.mlp = CustomDeepseekDBOMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = 
RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.routed_scaling_factor = config.routed_scaling_factor + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + previous_hidden_states, previous_residual = hidden_states, residual + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + # Dispose hidden_states and residual from the previous layer + # to save npu memory because they're no longer used. + dispose_tensor(previous_hidden_states) + dispose_tensor(previous_residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + if hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # We scale both hidden_states and residual before + # rmsnorm, and the rmsnorm result would not be affected by the scale. + hidden_states *= 1. / self.routed_scaling_factor + if self.layer_idx == 0: + # The residual is shared by all layers, we only scale it on + # first layer. + residual *= 1. / self.routed_scaling_factor + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if isinstance(self.mlp, CustomDeepseekDBOMoE): + hidden_states = self.mlp(hidden_states, attn_metadata) + else: + hidden_states = self.mlp(hidden_states) + + if isinstance( + self.mlp, + CustomDeepseekDBOMLP) and hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # Scaling the DeepseekV2MLP output, it is the input of + # input_layernorm of next decoder layer. 
+ # The scaling of DeepseekV2MOE output would be done in the forward + # of DeepseekV2MOE + hidden_states *= 1. / self.routed_scaling_factor + + return hidden_states, residual + + # ----------------------------------------- TBO-related -------------------------------------------- + def _forward_ms_layer( + self, + positions: List[torch.Tensor], + hidden_states: List[torch.Tensor], + residual: List[torch.Tensor], + attn_metadata: List[AttentionMetadata], + kv_cache: Optional[torch.Tensor] = None, + is_prefill: bool = False, + ) -> tuple[List[torch.Tensor], List[torch.Tensor]]: + layer_index, ms_metadata, _ = get_multistream_layer_context() + assert layer_index >= 0 and ms_metadata is not None + num_micro_batchs = ms_metadata.ms_config.num_micro_batches + assert isinstance(self.mlp, CustomDeepseekDBOMoE) + assert len(positions) == num_micro_batchs + assert len(hidden_states) == num_micro_batchs + assert residual is not None + assert attn_metadata is not None + num_tokens = [] + hidden_dims = [] + shared_outputs = [] + router_logits = [] + chunk_hidden_states = [] + + # block 1 : attention + # block 2 : attn tp communication + # the attn computation of microbatch 1 can be overlapped with the moe + # communication in the previous layer, and the attn computation of microbatch 2 + # can be overlapped with the attn communication of microbatch 1 + for i in range(num_micro_batchs): + # wait last layer moe finishing communication + ms_metadata.try_wait_event(layer_index - 1, i, + MSEventKey.FFN_AR_FINISH) + context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.ATTN_COM_FINISH], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.ATTN_AR_FINISH], + ) + + with set_multistream_context(context, i): + forward_context = get_forward_context() + forward_context.attn_metadata = attn_metadata[i] + + # input layernorm + hidden_states[i], residual[ + i] = 
self._forward_ms_op_input_layernorm( + hidden_states[i], residual[i]) + # attention and tp allreduce + hidden_states[i], residual[i] = self._forward_ms_op_attn( + positions[i], hidden_states[i], residual[i], kv_cache, + attn_metadata[i]) + + # block 3 : shared experts + # if there is an allreduce ops in shared expert, we can overlap it with the computation of the + # shared expert for next microbatch or moe gating + for i in range(num_micro_batchs): + ms_metadata.try_wait_event(layer_index, i, + MSEventKey.ATTN_AR_FINISH) + context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_SE_COMP_FINISH], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_SE_COMM_FINISH], + ) + with set_multistream_context(context, i): + # compute shared expert after finishing ATTN AR + hidden_states[i], residual[ + i] = self._forward_ms_op_post_attn_layernorm( + hidden_states[i], residual[i]) + + num_token, hidden_dim = hidden_states[i].shape + hidden_states[i] = hidden_states[i].view(-1, hidden_dim) + num_tokens.append(num_token) + hidden_dims.append(hidden_dim) + if self.mlp.n_shared_experts is not None: + # TODO: we can move shared expert computation into next block if reduce results is false + shared_output = self.mlp._forward_ms_op_shared_expert( + hidden_states[i]) + shared_outputs.append(shared_output) + + # block 4 : moe + for i in range(num_micro_batchs): + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. + # TODO: need a better flag to indicate whether in profile run or not. 
+ if attn_metadata[i] is None: + # for profile run + is_prefill = True + enable_force_load_balance = True + else: + is_prefill = attn_metadata[i].num_prefills > 0 + enable_force_load_balance = False + + if self.mlp.tp_size > 1: + num_token, _ = hidden_states[i].shape + padded_num_tokens = (self.mlp.tp_size - num_tokens[i] % + self.mlp.tp_size) % self.mlp.tp_size + if padded_num_tokens > 0: + hidden_states[i] = nn.functional.pad( + hidden_states[i], (0, 0, 0, padded_num_tokens)) + chunk_hidden_state = torch.tensor_split(hidden_states[i], + self.mlp.tp_size, + dim=0) + chunk_hidden_states.append(chunk_hidden_state) + local_hidden_states = chunk_hidden_state[self.mlp.tp_rank] + else: + local_hidden_states = hidden_states[i] + + router_logit = self.mlp._forward_ms_op_gate(local_hidden_states) + router_logits.append(router_logit) + + if CustomDeepseekDBOMoE.top_k: + real_top_k = CustomDeepseekDBOMoE.top_k + else: + real_top_k = self.mlp.experts.top_k + + hidden_states[i] = self.mlp.experts._forward_ms_fused_moe_comp( + local_hidden_states, router_logits[i], is_prefill, real_top_k, + enable_force_load_balance) + + # the following kernels will be submitted to the comm stream to overlap the computation of the + # moe computation of next microbatch and the attn computation of next layer + context = MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_COM_FINISH], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_AFTER_COMM], + ) + context.before_comm_event.record() + with torch.npu.stream(ms_metadata.communicate_stream): + context.before_comm_event.wait() + if self.mlp.experts.reduce_results and ( + self.mlp.experts.tp_size > 1 + or self.mlp.experts.ep_size > 1): + hidden_states[i] = tensor_model_parallel_all_reduce( + hidden_states[i]) + hidden_states[ + i] = hidden_states[i] * self.mlp.routed_scaling_factor + context.after_comm_event.record() + + context 
= MultiStreamStepMetadata( + comm_stream=ms_metadata.communicate_stream, + before_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.MOE_AFTER_COMM], + after_comm_event=ms_metadata.ms_events[layer_index][i][ + MSEventKey.FFN_AR_FINISH], + ) + with set_multistream_context(context, i): + if self.mlp.tp_size > 1: + hidden_states[i] = self.mlp._forward_ms_op_tp_allgather( + hidden_states[i], chunk_hidden_states[i], + padded_num_tokens) + with torch.npu.stream(ms_metadata.communicate_stream): + # last + if shared_outputs[i] is not None: + hidden_states[i] = hidden_states[i] + shared_outputs[i] + hidden_states[i] = hidden_states[i].view( + num_tokens[i], hidden_dims[i]) + if isinstance(self.mlp, CustomDeepseekDBOMLP + ) and hidden_states[i].dtype == torch.float16: + # Fix FP16 overflow + # Scaling the DeepseekV2MLP output, it is the input of + # input_layernorm of next decoder layer. + # The scaling of DeepseekV2MOE output would be done in the forward + # of DeepseekV2MOE + hidden_states[i] *= 1. 
/ self.routed_scaling_factor + context.after_comm_event.record() + return hidden_states, residual + + # should split ops in Decoder Layer + def _forward_ms_op_input_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + return hidden_states, residual + + def _forward_ms_op_attn( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + if hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # We scale both hidden_states and residual before + # rmsnorm, and the rmsnorm result would not be affected by the scale. + hidden_states *= 1. / self.routed_scaling_factor + if self.layer_idx == 0: + # The residual is shared by all layers, we only scale it on + # first layer. + residual *= 1. 
/ self.routed_scaling_factor + return hidden_states, residual + + def _forward_ms_op_post_attn_layernorm( + self, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ): + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + return hidden_states, residual + + +class CustomDeepseekDBOModel(nn.Module): + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.first_k_dense_replace = config.first_k_dense_replace + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: CustomDeepseekDBODecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers") + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + # tbo related members + if VLLM_ASCEND_ENABLE_DBO: + self.use_mla = model_config.use_mla + self.multistream_config = MultiStreamConfig() + multistream_metadata = make_multistream_metadata_ds( + start_layer=self.start_layer + self.first_k_dense_replace, + end_layer=self.end_layer, + causal_lm=getattr(config, "causal_lm", True), + multistream_config=self.multistream_config, + ) + 
self.ms_pre_layer = MultiStreamPreTransformerLayer( + multistream_metadata) + self.ms_post_layer = MultiStreamPostTransformerLayer( + multistream_metadata) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + num_normal_layers = (self.first_k_dense_replace + if VLLM_ASCEND_ENABLE_DBO and self.can_run_ms() + else self.end_layer - self.start_layer) + + moe_start_layer = self.start_layer + num_normal_layers + for i in range(self.start_layer, min(moe_start_layer, self.end_layer)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, hidden_states, residual, + kv_caches[i - + self.start_layer] if kv_caches is not None else None, + attn_metadata) + + if moe_start_layer < self.end_layer: + # if we enable multistream/dbo, process sparse layers here + hidden_states, residual = self._forward_ms_layers( + positions=positions, + hidden_states=hidden_states, + residual=residual, + moe_start_layer=moe_start_layer, + kv_caches=kv_caches, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def can_run_ms(self): + attn_metadata = get_forward_context().attn_metadata + # 
enable prefill overlap + return not (attn_metadata is None or attn_metadata.num_prefills == 0 + or not attn_metadata.enable_dbo_across_dp) + + def _forward_ms_layers( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor, + moe_start_layer: int, + kv_caches: Optional[List[torch.Tensor]] = None, + is_prefill: bool = False, + ): + + if moe_start_layer == self.end_layer: + return hidden_states, residual + + attn_metadata, [positions, hidden_states, + residual] = self.ms_pre_layer( + [positions, hidden_states, residual], ) + # the rest layers + for i in range(moe_start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer._forward_ms_layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + attn_metadata=attn_metadata, + kv_cache=kv_caches[i - self.start_layer] + if kv_caches is not None else None, + is_prefill=is_prefill) + advance_step_multistream_layer_context() + + [hidden_states, + residual] = self.ms_post_layer([hidden_states, residual], ) + return hidden_states, residual + + +class CustomDeepseekDBOForCausalLM(DeepseekV2ForCausalLM): + # add `packed_modules_mapping` in `DeepseekV2ForCausalLM` to support weight merging + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = CustomDeepseekDBOModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = 
get_sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + # NOTE: This `load_weights` is mainly copied from + # https://github.com/vllm-project/vllm/commit/07b8fae219b1fff51ef115c38c44b51395be5bb5 + # to fix CI, and it is different from the implementation in main + # TODO: support eplb style load_weights + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + """""" + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = AscendFusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + return_success=False) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py new file mode 100644 index 0000000..8bcc4fb --- /dev/null +++ b/vllm_ascend/models/deepseek_mtp.py @@ -0,0 +1,218 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# Adapted from vllm/model_executor/models/deepseek_mtp.py +# Copyright 2023 The vLLM team. +# +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import CacheConfig, ModelConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.deepseek_mtp import ( + DeepSeekMTP, DeepSeekMultiTokenPredictor, DeepSeekMultiTokenPredictorLayer, + SharedHead) +from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .deepseek_v2 import CustomDeepseekV2DecoderLayer + + +class CustomDeepSeekShareHead(SharedHead): + + def __init__(self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + nn.Module.__init__(self) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.head = 
ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "head")) + + +class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + nn.Module.__init__(self) + + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.eh_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.shared_head = CustomDeepSeekShareHead(config=config, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "shared_head")) + self.mtp_block = CustomDeepseekV2DecoderLayer(config, prefix, + model_config, + cache_config, + quant_config) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds = torch.where((positions == 0).unsqueeze(-1), + torch.zeros_like(inputs_embeds), + inputs_embeds) + inputs_embeds = self.enorm(inputs_embeds) + previous_hidden_states = self.hnorm(previous_hidden_states) + + hidden_states = self.eh_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + residual=None) + hidden_states = residual + hidden_states + return hidden_states + + +class CustomDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + 
nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + CustomDeepSeekMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + model_config=vllm_config.model_config, + cache_config=vllm_config.cache_config, + quant_config=vllm_config.quant_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + + # Note: torch._dynamo.exc.Unsupported: builtin: str + self.layers_list = [ + self.layers[str(idx)] + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + ] + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: torch.Tensor, + attn_metadata: AttentionMetadata, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + current_step_idx = (spec_step_idx % self.num_mtp_layers) + step_kv_cache = kv_caches[ + current_step_idx] if kv_caches is not None else None + return self.layers_list[current_step_idx]( + input_ids, + positions, + step_kv_cache, + attn_metadata, + previous_hidden_states, + inputs_embeds, + current_step_idx, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + current_step_idx = (spec_step_idx % self.num_mtp_layers) + mtp_layer = self.layers_list[current_step_idx] + logits = self.logits_processor(mtp_layer.shared_head.head, + mtp_layer.shared_head(hidden_states), + sampling_metadata) + return 
logits + + +class CustomDeepSeekMTP(DeepSeekMTP): + # NOTE 1. The MTP layer of the quantized DeepSeek model is not quantized on the NPU; + # NOTE 2. The description file generated by the current msmodelslim tool does not have + # MTP layer info. Please manually add it and set the value to FLOAT. + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + self.config = vllm_config.model_config.hf_config + self.model = CustomDeepSeekMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + + self.sampler = get_sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + previous_hidden_states: Optional[torch.Tensor] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, previous_hidden_states, + inputs_embeds, spec_step_idx) + return hidden_states \ No newline at end of file diff --git a/vllm_ascend/models/deepseek_v2.py b/vllm_ascend/models/deepseek_v2.py new file mode 100644 index 0000000..6d0913c --- /dev/null +++ b/vllm_ascend/models/deepseek_v2.py @@ -0,0 +1,997 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # Adapted from +# # vllm-project/vllm/blob/main/vllm/model_executor/models/deepseek_v2.py +# # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py +# """Inference-only DeepseekV2/DeepseekV3 model.""" + +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +import torch +import torch_npu +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.config import (CacheConfig, ModelConfig, VllmConfig, + get_current_vllm_config) +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, split_tensor_along_last_dim, + tensor_model_parallel_all_reduce, + tensor_model_parallel_reduce_scatter) +from vllm.distributed.parallel_state import get_dp_group, get_ep_group +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + 
RowParallelLinear, + UnquantizedLinearMethod) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import \ + DeepseekV2ForCausalLM # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import \ + yarn_get_mscale # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import ( + DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention, + get_spec_layer_idx_from_weight_name) +from vllm.model_executor.models.utils import ( + PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.quantization.quant_config import AscendLinearMethod +from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod +from vllm_ascend.utils import dispose_tensor + + +class CustomDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. 
+ quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class CustomDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + +class CustomDeepseekV2RowParallelLinearReplaceAllreduce(RowParallelLinear): + + def forward( + self, + input_, + is_prefill=True, + is_force_scatter=False + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]: + if self.input_is_parallel: + input_parallel = input_ + else: + tp_rank = get_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + + # Matrix multiply. 
+ assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + if self.reduce_results and self.tp_size > 1: + num_tokens = output_parallel.shape[0] + if is_force_scatter and num_tokens % self.tp_size: + output_parallel = nn.functional.pad( + output_parallel, (0, 0, 0, -num_tokens % self.tp_size)) + if is_force_scatter or (not is_prefill + and output_parallel.shape[0] % self.tp_size + == 0): + output = tensor_model_parallel_reduce_scatter(output_parallel, + dim=0) + else: + output = tensor_model_parallel_all_reduce(output_parallel) + else: + output = output_parallel + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + +class CustomDeepseekV2RowParallelLinear(RowParallelLinear): + + def forward( + self, + input_, + is_prefill=True, + is_force_scatter=False + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]: + if self.input_is_parallel: + input_parallel = input_ + else: + tp_rank = get_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + + # Matrix multiply. 
+ assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + if self.reduce_results and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output_parallel) + else: + output = output_parallel + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + +class CustomDeepseekV2MLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + force_replicate: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + if not force_replicate: + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + else: + self.gate_up_proj = CustomDeepseekV2MergedReplicatedLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = ReplicatedLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = CustomDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) and isinstance( + quant_method.quant_method, AscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = CustomDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class CustomDeepseekV2MoE(nn.Module): + + top_k: int + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + self.n_shared_experts = config.n_shared_experts + if self.tp_size > config.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.n_routed_experts}.") + + if config.hidden_act != "silu": + 
raise ValueError(f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now.") + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and \ + self.torchair_graph_enabled + + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = AscendFusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + + if config.n_shared_experts is not None: + self.all_reduce_merge = self.experts.all_reduce_merge + reduce_results = not self.all_reduce_merge + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + self.shared_experts = CustomDeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=reduce_results, + force_replicate=self.enable_multistream_moe + or enable_shared_expert_dp, + prefix=f"{prefix}.shared_experts", + ) + else: + self.shared_experts = None # type: ignore + CustomDeepseekV2MoE.top_k = config.num_experts_per_tok + + self.dp_size = get_dp_group().world_size + + self.tp_group = get_tp_group().device_group 
+ self.tp_rank = get_tp_group().rank_in_group + self.ep_group = get_ep_group() + self.kv_consumer = None + transfer_config = get_current_vllm_config().kv_transfer_config + if transfer_config is not None: + self.kv_consumer = transfer_config.kv_role == "kv_consumer" + + self.params_dtype = torch.get_default_dtype() + self.rm_router_logits = self.experts.rm_router_logits + + def forward(self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None, + replace_allreduce: bool = False) -> torch.Tensor: + + forward_context = get_forward_context() + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. + + enable_force_load_balance = forward_context.in_profile_run + + is_prefill = forward_context.with_prefill + + # If this node is kv_consumer, we force the moe always runs in decode path to make sure + # the behaviour aligned between dummy_run and normal model_execute. + if self.kv_consumer: + is_prefill = False + enable_force_load_balance = False + + # router_logits: (num_tokens, n_experts) + router_logits = None + if not self.rm_router_logits and not self.enable_multistream_moe: + router_logits, _ = self.gate(hidden_states) + + experts_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + is_prefill=is_prefill, + top_k=CustomDeepseekV2MoE.top_k, + enable_force_load_balance=enable_force_load_balance, + shared_experts=self.shared_experts, + gate=self.gate, + replace_allreduce=replace_allreduce) + + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) + if self.all_reduce_merge: + # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce + hidden_states = tensor_model_parallel_all_reduce(hidden_states) + + return hidden_states + + +class 
CustomDeepseekV2MLAAttention(DeepseekV2MLAAttention): + + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + qk_nope_head_dim: int, + qk_rope_head_dim: int, + v_head_dim: int, + q_lora_rank: Optional[int], + kv_lora_rank: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + nn.Module.__init__(self) + self.hidden_size = hidden_size + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim + self.v_head_dim = v_head_dim + + self.q_lora_rank = q_lora_rank + self.kv_lora_rank = kv_lora_rank + + self.num_heads = num_heads + self.tp_size = get_tensor_model_parallel_world_size() + assert num_heads % self.tp_size == 0 + self.num_local_heads = num_heads // self.tp_size + self.layers = config.num_hidden_layers + self.first_k_dense_replace = config.first_k_dense_replace + + self.scaling = self.qk_head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.prefix = prefix + self.debug_layer_idx = int(self.prefix.split(".")[-2]) + + ascend_config = get_ascend_config() + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + + if self.q_lora_rank is not None: + self.q_a_proj = ReplicatedLinear(self.hidden_size, + self.q_lora_rank, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_a_proj") + self.q_a_layernorm = RMSNorm(self.q_lora_rank, + eps=config.rms_norm_eps) + self.q_b_proj = ColumnParallelLinear(q_lora_rank, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.q_b_proj") + else: + self.q_proj = ColumnParallelLinear(self.hidden_size, + self.num_heads * + self.qk_head_dim, + bias=False, + quant_config=quant_config, + 
prefix=f"{prefix}.q_proj") + + self.kv_a_proj_with_mqa = ReplicatedLinear( + self.hidden_size, + self.kv_lora_rank + self.qk_rope_head_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_a_proj_with_mqa") + self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, + eps=config.rms_norm_eps) + self.kv_b_proj = ColumnParallelLinear( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_b_proj") + if (config.n_routed_experts is not None + and self.debug_layer_idx >= config.first_k_dense_replace + and self.debug_layer_idx % config.moe_layer_freq == 0 + and self.enable_shared_expert_dp): + self.o_proj = CustomDeepseekV2RowParallelLinearReplaceAllreduce( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + else: + self.o_proj = CustomDeepseekV2RowParallelLinear( + self.num_heads * self.v_head_dim, + self.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' + self.rotary_emb = get_rope(qk_rope_head_dim, + rotary_dim=qk_rope_head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=False) + if rope_scaling: + mscale_all_dim = rope_scaling.get("mscale_all_dim", False) + scaling_factor = rope_scaling["factor"] + mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) + self.scaling = self.scaling * mscale * mscale + + # In the MLA backend, kv_cache includes both k_c and + # pe (i.e. decoupled position embeddings). In particular, + # the concat_and_cache_mla op requires + # k_c.size(1) + k_pe.size(1) == kv_cache.size(2) + # i.e. 
+ # kv_lora_rank + qk_rope_head_dim == head_size + self.mla_attn = Attention( + num_heads=self.num_local_heads, + head_size=self.kv_lora_rank + self.qk_rope_head_dim, + scale=self.scaling, + num_kv_heads=1, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + use_mla=True, + # MLA Args + q_lora_rank=self.q_lora_rank, + kv_lora_rank=self.kv_lora_rank, + qk_nope_head_dim=self.qk_nope_head_dim, + qk_rope_head_dim=self.qk_rope_head_dim, + qk_head_dim=self.qk_head_dim, + v_head_dim=self.v_head_dim, + rotary_emb=self.rotary_emb, + q_a_proj=self.q_a_proj if self.q_lora_rank is not None else None, + q_a_layernorm=self.q_a_layernorm + if self.q_lora_rank is not None else None, + q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, + kv_a_proj_with_mqa=self.kv_a_proj_with_mqa, + kv_a_layernorm=self.kv_a_layernorm, + kv_b_proj=self.kv_b_proj, + o_proj=self.o_proj, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + forward_context = get_forward_context() + if kv_cache is None: + kv_cache = self.mla_attn.kv_cache[forward_context.virtual_engine] + num_tokens = hidden_states.shape[0] + need_gather_q_kv = False + if self.enable_shared_expert_dp and self.debug_layer_idx > self.first_k_dense_replace and self.debug_layer_idx < self.layers: + # Simulate all gather to calculate output shape + num_tokens = num_tokens * self.tp_size + need_gather_q_kv = True + if not self.enable_shared_expert_dp or self.debug_layer_idx < self.first_k_dense_replace: + output_shape = hidden_states.shape + else: + rows = num_tokens // self.tp_size + if num_tokens % self.tp_size: + rows += 1 + output_shape = (rows, hidden_states.shape[1]) + output = torch.empty(output_shape, + dtype=hidden_states.dtype, + device=hidden_states.device) + output = self.mla_attn.impl.forward(hidden_states, kv_cache, + 
forward_context.attn_metadata, + need_gather_q_kv, output) + output = output.view(-1, output_shape[-1]) + return output + + +class CustomDeepseekV2DecoderLayer(DeepseekV2DecoderLayer): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + model_config: ModelConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + nn.Module.__init__(self) + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. + layer_idx = int(prefix.split(sep='.')[-1]) + self.layer_idx = layer_idx + self.layers = config.num_hidden_layers + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tp_group().rank_in_group + ascend_config = get_ascend_config() + # TODO: enable mla in vllm-ascend + if model_config.use_mla: + attn_cls = CustomDeepseekV2MLAAttention + else: + attn_cls = DeepseekV2Attention + self.self_attn = attn_cls( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + qk_nope_head_dim=config.qk_nope_head_dim, + qk_rope_head_dim=config.qk_rope_head_dim, + v_head_dim=config.v_head_dim, + q_lora_rank=config.q_lora_rank + if hasattr(config, "q_lora_rank") else None, + kv_lora_rank=config.kv_lora_rank, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): + self.mlp = CustomDeepseekV2MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + else: + self.mlp = CustomDeepseekV2MLP( + 
hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.routed_scaling_factor = config.routed_scaling_factor + self.first_k_dense_replace = config.first_k_dense_replace + self.tp_group = get_tp_group().device_group + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + replace_allreduce: bool = False, + ) -> torch.Tensor: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + previous_hidden_states, previous_residual = hidden_states, residual + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + # Dispose hidden_states and residual from the previous layer + # to save npu memory because they're no longer used. + dispose_tensor(previous_hidden_states) + dispose_tensor(previous_residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + if hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # We scale both hidden_states and residual before + # rmsnorm, and rmsnorm result would not affect by scale. + hidden_states *= 1. / self.routed_scaling_factor + if self.layer_idx == 0: + # The residual is shared by all layers, we only scale it on + # first layer. + residual *= 1. 
/ self.routed_scaling_factor + + tp_size = get_tensor_model_parallel_world_size() + if self.enable_shared_expert_dp and ( + self.layer_idx == self.first_k_dense_replace + or self.layer_idx == self.layers) and tp_size > 1: + num_tokens, _ = residual.shape + if num_tokens % tp_size: + residual = nn.functional.pad(residual, + (0, 0, 0, -num_tokens % tp_size)) + chunk_residual = torch.tensor_split(residual, tp_size, dim=0) + tp_rank = get_tensor_model_parallel_rank() + residual = chunk_residual[tp_rank] + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if isinstance(self.mlp, CustomDeepseekV2MoE): + hidden_states = self.mlp(hidden_states, attn_metadata) + else: + hidden_states = self.mlp(hidden_states) + + if isinstance( + self.mlp, + CustomDeepseekV2MLP) and hidden_states.dtype == torch.float16: + # Fix FP16 overflow + # Scaling the DeepseekV2MLP output, it is the input of + # input_layernorm of next decoder layer. + # The scaling of DeepseekV2MOE output would be done in the forward + # of DeepseekV2MOE + hidden_states *= 1. / self.routed_scaling_factor + + # for last layer of main model and mtp layer. 
+ if self.enable_shared_expert_dp and self.layer_idx >= ( + self.layers - 1) and tp_size > 1: + hidden_states = get_tp_group().all_gather(hidden_states, 0) + residual = get_tp_group().all_gather(residual, 0) + + attn_metadata = get_forward_context().attn_metadata + if attn_metadata is not None: + num_tokens = attn_metadata.num_actual_tokens + else: + num_tokens = hidden_states.shape[0] + + if num_tokens < hidden_states.shape[0]: + hidden_states = hidden_states[:num_tokens] + residual = residual[:num_tokens] + + return hidden_states, residual + + +class CustomDeepseekV2Model(nn.Module): + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.tp_size = get_tensor_model_parallel_world_size() + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: CustomDeepseekV2DecoderLayer( + config, + prefix, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + ), + prefix=f"{prefix}.layers") + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: 
torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + replace_allreduce = hidden_states.shape[0] % self.tp_size == 0 + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + residual, + kv_caches[i - + self.start_layer] if kv_caches is not None else None, + attn_metadata, + replace_allreduce=replace_allreduce) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM): + # add `packed_modules_mapping` in `DeepseekV2ForCausalLM` to support weight merging + packed_modules_mapping = { + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = CustomDeepseekV2Model(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + 
prefix=maybe_prefix( + prefix, "lm_head")) + else: + self.lm_head = PPMissingLayer() + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + # NOTE: This `load_weights` is mainly copied from + # https://github.com/vllm-project/vllm/commit/07b8fae219b1fff51ef115c38c44b51395be5bb5 + # to fix CI, and it is different from the implementation in main + # TODO: support eplb style load_weights + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + """""" + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = AscendFusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "module" in name: + continue + + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue # skip spec decode layers for main model + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. 
+ if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id, + return_success=False) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm_ascend/models/deepseek_v3.py b/vllm_ascend/models/deepseek_v3.py new file mode 100644 index 0000000..4d09ef0 --- /dev/null +++ b/vllm_ascend/models/deepseek_v3.py @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2ForCausalLM + + +class CustomDeepseekV3ForCausalLM(CustomDeepseekV2ForCausalLM): + pass diff --git a/vllm_ascend/models/pangu_moe.py b/vllm_ascend/models/pangu_moe.py new file mode 100644 index 0000000..3e2148c --- /dev/null +++ b/vllm_ascend/models/pangu_moe.py @@ -0,0 +1,1106 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch_npu +from torch import nn +from torch.nn import Parameter +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (divide, get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group, get_world_group) +from vllm.forward_context import get_forward_context +from vllm.logger import logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearBase, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + extract_layer_index, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors + +from vllm_ascend.utils 
import ACL_FORMAT_FRACTAL_NZ, is_310p + +_ROUTER_SCALE = None + + +def use_h2p(): + # only use H2P when dp_size > 1. + if get_dp_group().world_size > 1: + return True + return False + + +# This class is adapted from vllm.model_executor.layers.linear.MergedColumnParallelLinear. +# It is used to customize parallelism of certain linear(e.g., shared experts with all-rank tp). +class CustomMergedColumnParallelLinear(LinearBase): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + ): + # Divide the weight matrix along the last dimension. + output_size = sum(output_sizes) + self.output_sizes = output_sizes + self.tp_size = get_tp_group().world_size + self.input_size_per_partition = input_size + self.output_size_per_partition = divide(output_size, self.tp_size) + self.output_partition_sizes = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. 
+ if hasattr(self, "output_sizes"): + self.output_partition_sizes = [ + divide(output_size, self.tp_size) + for output_size in self.output_sizes + ] + + super().__init__(input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias) + + self.gather_output = gather_output + + if output_sizes is None: + output_sizes = [output_size] + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=self.weight_loader) + if bias: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, + dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor, + loaded_shard_id: int): + param_data = param.data + output_dim = getattr(param, "output_dim", None) + + assert loaded_shard_id < len(self.output_sizes) + + tp_rank = get_tp_group().rank_in_group + tp_size = get_tp_group().world_size + if output_dim is not None: + shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size + shard_size = self.output_sizes[loaded_shard_id] // tp_size + + is_sharded_weight = getattr(param, "is_sharded_weight", False) + param_data = param_data.narrow(output_dim, shard_offset, + shard_size) + start_idx = tp_rank * shard_size + if not is_sharded_weight: + loaded_weight = loaded_weight.narrow(output_dim, start_idx, + shard_size) + else: + ignore_warning = getattr(param, "ignore_warning", False) + if not ignore_warning: + logger.warning( + "Loading a weight without `output_dim` attribute in " + "MergedColumnParallelLinear, assume the weight is " + "the same for all partitions.") + + assert param_data.shape == 
loaded_weight.shape + param_data.copy_(loaded_weight) + + def forward( + self, input_ + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + bias = self.bias if not self.skip_bias_add else None + + # Matrix multiply. + assert self.quant_method is not None + output_parallel = self.quant_method.apply(self, input_, bias) + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + if not self.return_bias: + return output + return output, output_bias + + +# This class is adapted from vllm.model_executor.layers.linear.RowParallelLinear. +# It is used to customize parallelism of certain linear(e.g., shared experts with all-rank tp) +# and detach communication to enable customized communication algorithms(e.g., H2P). +class CustomRowParallelLinear(LinearBase): + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + group=None, + ): + # Divide the weight matrix along the first dimension. 
+ self.group = group if group is not None else get_tp_group() + self.tp_rank = self.group.rank_in_group + self.tp_size = self.group.world_size + self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + + super().__init__(input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias) + + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=self.weight_loader) + if not reduce_results and (bias and not skip_bias_add): + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + tp_rank = self.group.rank_in_group + input_dim = getattr(param, "input_dim", None) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + is_sharded_weight = is_sharded_weight + + param_data = param.data + if input_dim is not None and not is_sharded_weight: + shard_size = param_data.shape[input_dim] + start_idx = tp_rank * shard_size + loaded_weight = loaded_weight.narrow(input_dim, start_idx, + shard_size) + + # Special case for loading scales off disk, which often do not + # have a shape (such as in the case of AutoFP8). 
+ if len(loaded_weight.shape) == 0: + loaded_weight = loaded_weight.reshape(1) + + assert param_data.shape == loaded_weight.shape + param_data.copy_(loaded_weight) + + def forward( + self, input_ + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + input_parallel = input_ + + # Matrix multiply. + assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + output = self.quant_method.apply(self, input_parallel, bias=bias_) + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + +class PanguProMoEMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + if not use_h2p(): + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + else: + self.gate_up_proj = CustomMergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = CustomRowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj", + ) + + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +def topk_wrapper(num_voted_experts): + + def pangu_group8_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool = False, + num_expert_group: int = 0, + topk_group: int = 0, + global_num_experts: int = 0, + ): + scores = F.softmax(gating_output, dim=1) + num_tokens = scores.shape[0] + router_scale = _ROUTER_SCALE.squeeze( # type: ignore + ) + # TODO: support disable expert parallel + ep_size = get_ep_group().world_size + local_num_experts = global_num_experts // ep_size + local_num_group = topk // ep_size + experts_per_group = global_num_experts // topk + local_group_start = get_ep_group().rank_in_group * local_num_experts + local_group_end = (get_ep_group().rank_in_group + + 1) * local_num_experts + scores = F.softmax(gating_output, dim=1) + scores = scores[..., local_group_start:local_group_end] + + router_weights = router_scale[local_group_start:local_group_end] + + if num_voted_experts == 8: + # use original topk + topk_weights, topk_ids = torch.max(scores.view( + scores.shape[0], local_num_group, -1), + dim=-1) + bias = torch.arange(0, + local_num_experts, + experts_per_group, + device=scores.device, + dtype=torch.int32).unsqueeze(0) + topk_ids = topk_ids.to(torch.int32) + bias + + else: + group_expert_indices = torch.arange(experts_per_group, + dtype=torch.int32, + device=scores.device).view( + 1, 1, -1) + group_expert_offset = (torch.arange( + local_num_group, dtype=torch.int32, device=scores.device) * + experts_per_group).unsqueeze(0) + expert_index_range = torch.arange(experts_per_group, + dtype=torch.int32, + device=scores.device) + + scores_grouped = scores.view(num_tokens, local_num_group, + experts_per_group) + best_expert_idx = torch.argmax(scores_grouped, + dim=2) # (num_tokens, num_groups) + vote_mask = 
(best_expert_idx.unsqueeze(-1).to( + torch.int32) == group_expert_indices) + + expert_vote_freq = vote_mask.sum(dim=0) + + sorted_indices = torch.argsort(expert_vote_freq, + dim=1, + descending=True).to(torch.int32) + topk_experts = sorted_indices[:, :num_voted_experts] + keep_mask = (( + topk_experts.unsqueeze(-1) == expert_index_range).any( + dim=1)).unsqueeze(0) + + masked_scores = torch.where(keep_mask, scores_grouped, 0) + + topk_weights, best_pos_in_group = masked_scores.max(dim=2) + best_pos_in_group = best_pos_in_group.to(torch.int32) + topk_ids = (best_pos_in_group + group_expert_offset).to( + torch.int32) + + flatten_topk_ids = topk_ids.view(-1) + router_weights = router_weights.index_select(0, flatten_topk_ids).view( + topk_ids.shape) + topk_weights *= router_weights + + return topk_weights, topk_ids + + return pangu_group8_topk + + +class PanguProMoESparseMoeBlock(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.num_experts = config.num_experts + + if self.tp_size > config.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.num_experts}.") + + self.num_experts_per_tok = config.num_experts_per_tok + self.router_scale = torch.nn.Parameter( + torch.ones((1, self.num_experts))) + + # on 300I Duo platform, we find that num_voted_experts set to 5 achieves + # good performance without sacrifice too much accuracy. for other platform, + # this is set to 8 to use original pangu grouped topk. 
+ num_voted_experts = 5 if is_310p() else 8 + + self.experts = FusedMoE( + num_experts=config.num_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + quant_config=quant_config, + custom_routing_function=topk_wrapper(num_voted_experts), + prefix=f"{prefix}.experts", + ) + self.use_ep = self.experts.use_ep + + self.gate = ReplicatedLinear( + config.hidden_size, + config.num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + if config.shared_expert_intermediate_size > 0: + self.shared_expert = PanguProMoEMLP( + hidden_size=config.hidden_size, + intermediate_size=config.shared_expert_intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=False, + prefix=f"{prefix}.shared_expert", + ) + else: + self.shared_expert = None # type: ignore + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + # NOTE: hidden_states can have either 1D or 2D shape. + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + shared_output = None + if self.shared_expert is not None: + shared_output = self.shared_expert(hidden_states) + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + global _ROUTER_SCALE + _ROUTER_SCALE = self.router_scale + + # TODO(angazenn): Does not support MC2 currently + get_forward_context().moe_comm_method_name = "allgathercommimpl" + + if not use_h2p(): + final_hidden_states = self.experts.forward_impl( + hidden_states=hidden_states, router_logits=router_logits) + else: + # TODO: when using h2p, we have to skip communication in vLLM + # native FusedMoE. here we need to design a better FusedMoE + # (maybe using AscendFusedMoE) to enable these different + # communication schema. 
+ final_hidden_states = self.experts.quant_method.apply( + layer=self.experts, + x=hidden_states, + router_logits=router_logits, + top_k=self.experts.top_k, + renormalize=False, + use_grouped_topk=False, + global_num_experts=self.experts.global_num_experts, + expert_map=self.experts.expert_map, + custom_routing_function=self.experts.custom_routing_function, + apply_router_weight_on_input=self.experts. + apply_router_weight_on_input) + + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + if not use_h2p(): + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + return final_hidden_states.view(num_tokens, hidden_dim) + + +class PanguProMoEAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[Dict[str, Any]] = None, + max_position_embeddings: int = 8192, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + if use_h2p(): + self.o_proj = CustomRowParallelLinear(self.total_num_heads * + self.head_dim, + hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + group=get_tp_group()) + else: + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + + output, _ = self.o_proj(attn_output) + return output + + +class PanguProMoEDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str 
= "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + + self.self_attn = PanguProMoEAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + # `mlp_only_layers` in the config. + layer_idx = extract_layer_index(prefix) + mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else + config.mlp_only_layers) + if (layer_idx not in mlp_only_layers) and (config.num_experts > 0): + self.mlp = PanguProMoESparseMoeBlock( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + else: + self.mlp = PanguProMoEMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + h2p_unpad_idx: Optional[torch.Tensor] = None, + h2p_pad_idx: Optional[torch.Tensor] = None, + is_start_layer: Optional[bool] = False, + ) -> torch.Tensor: + need_h2p_pad = h2p_unpad_idx is not None and h2p_pad_idx is not None \ + and h2p_unpad_idx.shape[0] < h2p_pad_idx.shape[0] + tp_size = get_tp_group().world_size + + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = 
self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + if use_h2p(): + if is_start_layer: + if need_h2p_pad: + residual = residual.index_select(dim=0, index=h2p_pad_idx) + residual = torch.tensor_split( + residual, tp_size)[get_tp_group().rank_in_group] + else: + if tp_size > 1: + hidden_states = get_tp_group().all_gather(hidden_states, 0) + if need_h2p_pad: + hidden_states = hidden_states.index_select( + dim=0, index=h2p_unpad_idx) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + if use_h2p(): + if need_h2p_pad: + hidden_states = hidden_states.index_select(dim=0, + index=h2p_pad_idx) + if tp_size > 1: + hidden_states = dist._functional_collectives.reduce_scatter_tensor( + hidden_states, + "sum", + scatter_dim=0, + group=get_tp_group().device_group) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if use_h2p(): + all_rank_group = get_world_group().device_group + output_size = (hidden_states.shape[0] * + get_world_group().world_size, + hidden_states.shape[1]) + # Allocate output tensor. + output_tensor = torch.empty(output_size, + dtype=hidden_states.dtype, + device=hidden_states.device) + # All-gather. 
+ dist.all_gather_into_tensor(output_tensor, + hidden_states, + group=all_rank_group) + hidden_states = output_tensor + + hidden_states = self.mlp(hidden_states, attn_metadata=attn_metadata) + + if use_h2p(): + hidden_states = dist._functional_collectives.reduce_scatter_tensor( + hidden_states, + "sum", + scatter_dim=0, + group=get_world_group().device_group) + + return hidden_states, residual + + +@support_torch_compile +class PanguProMoEModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: PanguProMoEDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = 
self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + if use_h2p(): + # calculate necessary padding/unpadding idx before model forward. + + # the attn_metadata will be passed directly when use torchair. + # if attn_meatadata is not passed, we try to get it from forward_context. + if attn_metadata is None: + attn_metadata = get_forward_context().attn_metadata + + max_tokens_across_dp = get_forward_context().max_tokens_across_dp + + tp_size = get_tp_group().world_size + # reduce scatter will split the input tensor into equal sizes and then scatter them on all ranks. + # we need pad it before if the shape can't be divided by group size. + # for h2p, we need pad it so that it can be divided by tp_size. + h2p_padded_len = ( + tp_size - (max_tokens_across_dp % tp_size) + ) % tp_size + max_tokens_across_dp - hidden_states.shape[0] + h2p_unpad_idx = torch.arange(hidden_states.shape[0], + device=hidden_states.device, + dtype=torch.int32) + h2p_pad_idx = torch.cat([ + h2p_unpad_idx, + torch.zeros(h2p_padded_len, + dtype=torch.int32, + device=hidden_states.device) + ]) + else: + h2p_unpad_idx = None + h2p_pad_idx = None + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer( + positions, hidden_states, residual, + kv_caches[i - + self.start_layer] if kv_caches is not None else None, + attn_metadata, h2p_unpad_idx, h2p_pad_idx, + i == self.start_layer) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + if use_h2p(): + if get_tp_group().world_size > 1: + hidden_states = get_tp_group().all_gather(hidden_states, 0) + if h2p_unpad_idx.shape[0] < h2p_pad_idx.shape[0]: + hidden_states = hidden_states.index_select(dim=0, + 
index=h2p_unpad_idx) + return hidden_states + + +class PanguProMoEForCausalLM(nn.Module, SupportsPP): + + fall_back_to_pt_during_load = False + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = PanguProMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head", + ) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + 
sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + tp_size = get_tp_group().world_size + tp_rank = get_tp_group().rank_in_group + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts) + + # expert_params_mapping = [] + + params_dict = dict(self.named_parameters()) # from model + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + # ======================================================= + # BF: add this to load with less layers + if 'layers' in name: + layer_idx = int(name.split('layers.')[-1].split('.')[0]) + if layer_idx >= self.model.end_layer: + continue + + if "rotary_emb.inv_freq" in name: + continue + + if "module" in name: + continue + + if name.endswith('kv_cache_offset'): + continue + + if name.endswith("k_proj.kv_cache_scale"): + remapped_kv_scale_name = name.replace( + "k_proj.kv_cache_scale", "attn.key_antiquant_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint " + f"(e.g. {name}), but not found the expected " + f"name in the model " + f"(e.g. {remapped_kv_scale_name}). 
" + "kv-scale is not loaded.") + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + loaded_weight = torch.tensor_split(loaded_weight, + tp_size, + dim=0)[tp_rank] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + if name.endswith("v_proj.kv_cache_scale"): + remapped_kv_scale_name = name.replace( + "v_proj.kv_cache_scale", "attn.value_antiquant_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint " + f"(e.g. {name}), but not found the expected " + f"name in the model " + f"(e.g. {remapped_kv_scale_name}). " + "kv-scale is not loaded.") + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + loaded_weight = torch.tensor_split(loaded_weight, + tp_size, + dim=0)[tp_rank] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if "mlp.experts" in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + # breakpoint() + name = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + weight_loader = param.weight_loader + # breakpoint() + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. + if name.endswith("kv_scale"): + remapped_kv_scale_name = name.replace( + ".kv_scale", ".attn.kv_scale") + if remapped_kv_scale_name not in params_dict: + logger.warning_once( + "Found kv scale in the checkpoint " + f"(e.g. {name}), but not found the expected " + f"name in the model " + f"(e.g. {remapped_kv_scale_name}). " + "kv-scale is not loaded.") + continue + else: + name = remapped_kv_scale_name + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + if is_310p() and "head" in name: + # on 300I Duo platform, ACL_FORMAT_FRACTAL_NZ is much more preferred than + # ACL_FORMAT_FRACTAL_ND by matmul operation. Since lmhead is also implemented + # by linear, we manually cast the format here. 
+ param.data = torch_npu.npu_format_cast(param.data, + ACL_FORMAT_FRACTAL_NZ) + return loaded_params diff --git a/vllm_ascend/models/qwen2_5_vl.py b/vllm_ascend/models/qwen2_5_vl.py new file mode 100644 index 0000000..31ad260 --- /dev/null +++ b/vllm_ascend/models/qwen2_5_vl.py @@ -0,0 +1,491 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Adapted from vllm/model_executor/models/qwen2_5_vl.py +# Copyright 2023 The vLLM team. +# +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from functools import partial +from typing import Callable, Iterable, Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_npu +from einops import rearrange +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionRotaryEmbedding, Qwen2_5_VisionTransformer, + Qwen2_5_VLDummyInputsBuilder, Qwen2_5_VLForConditionalGeneration, + Qwen2_5_VLMultiModalProcessor, Qwen2_5_VLProcessingInfo) +from vllm.model_executor.models.utils import maybe_prefix +from vllm.multimodal import MULTIMODAL_REGISTRY + +MIN_PAD_SIZE = 64 # min_size to pad weight +MAX_PAD_SIZE = 128 # max_size to pad weight + + +class AscendQwen2_5_VisionAttention(Qwen2_5_VisionAttention): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__( + embed_dim, + num_heads, + projection_size, + quant_config, + prefix, + ) + self.embed_dim = embed_dim + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head + if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: + self.hidden_size_per_attention_head = MAX_PAD_SIZE + + def split_qkv(self, qkv: 
torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.origin_hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer) + + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() + + output, _ = self.proj(context_layer) + return output + + +class AscendQwen2_5_VisionBlock(Qwen2_5_VisionBlock): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer, + quant_config, prefix) + 
self.attn = AscendQwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x = x + self.attn( + self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) + + x = x + self.mlp(self.norm2(x)) + return x + + +class AscendQwen2_5_VisionPatchEmbed(Qwen2_5_VisionPatchEmbed): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.matmul( + self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) + return x + + +class AscendQwen2_5_VisionRotaryEmbedding(Qwen2_5_VisionRotaryEmbedding): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__(dim, theta) + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.inv_freq = inv_freq + + +class AscendQwen2_5_VisionTransformer(Qwen2_5_VisionTransformer): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + interleaved=False, + ) -> None: + super().__init__(vision_config, norm_eps, quant_config, prefix) + norm_layer = partial(RMSNorm, eps=norm_eps) + self.interleaved = interleaved + self.enable_pad = False + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // + 2) + self.patch_embed = AscendQwen2_5_VisionPatchEmbed( + patch_size=vision_config.patch_size, + temporal_patch_size=vision_config.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) + + act_fn = get_act_and_mul_fn(vision_config.hidden_act) + self.blocks = nn.ModuleList([ + AscendQwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=act_fn, + norm_layer=norm_layer, + quant_config=quant_config, + 
prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(vision_config.depth) + ]) + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + self.hidden_size, self.num_heads) + + if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: + self.enable_pad = True + self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head + self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 + self.half_pad_hidden_size_per_attention_head = ( + MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 + self.hidden_size_per_attention_head = MAX_PAD_SIZE + + def cal_cos_sin(self, rotary_pos_emb): + cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] + sin = rotary_pos_emb.sin() + if self.enable_pad: + cos = torch.nn.functional.pad( + cos, (0, self.half_pad_hidden_size_per_attention_head)) + sin = torch.nn.functional.pad( + sin, (0, self.half_pad_hidden_size_per_attention_head)) + + if not self.interleaved: + cos_new = torch.cat((cos, cos), dim=-1) + sin_new = torch.cat((sin, sin), dim=-1) + else: + cos_new = rearrange(torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2) + sin_new = rearrange(torch.stack((sin, sin), dim=-1), + "... 
d two -> ...(d two)", + two=2) + cos_new = cos_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + sin_new = sin_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + return cos_new, sin_new + + def pad_qkv_bias(self, bias): + first_half = bias.reshape( + -1, 3, self.origin_hidden_size_per_attention_head + )[:, :, :self.half_origin_hidden_size_per_attention_head] + second_half = bias.reshape( + -1, 3, self.origin_hidden_size_per_attention_head + )[:, :, self.half_origin_hidden_size_per_attention_head:] + first_half_padded = torch.nn.functional.pad( + first_half, (0, self.half_pad_hidden_size_per_attention_head)) + second_half_padded = torch.nn.functional.pad( + second_half, (0, self.half_pad_hidden_size_per_attention_head)) + bias_padded = torch.cat([first_half_padded, second_half_padded], dim=2) + bias_final = bias_padded.reshape(-1) + return bias_final + + def pad_qkv_weight(self, data): + qkv_weight_first_half = data.reshape( + -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size + )[:, :, :self.half_origin_hidden_size_per_attention_head, :] + qkv_weight_second_half = data.reshape( + -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size + )[:, :, self.half_origin_hidden_size_per_attention_head:, :] + + qkv_weight_first_half_padded = torch.nn.functional.pad( + qkv_weight_first_half, + (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) + qkv_weight_second_half_padded = torch.nn.functional.pad( + qkv_weight_second_half, + (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) + qkv_weight_padded = torch.cat( + [qkv_weight_first_half_padded, qkv_weight_second_half_padded], + dim=2) + qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size) + return qkv_weight_final + + def pad_proj_weight(self, data): + out_weight = torch.nn.functional.pad( + data.reshape(self.hidden_size, -1, + self.half_origin_hidden_size_per_attention_head), + (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape( + 
self.hidden_size, -1) + return out_weight + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping: list[tuple[str, str, Union[str, int]]] = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("mlp.gate_up_proj.", "mlp.gate_proj.", 0), + ("mlp.gate_up_proj.", "mlp.up_proj.", 1), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + if ("attn.proj.weight" in name) and self.enable_pad: + param.data = self.pad_proj_weight(param.data) + if ("attn.qkv.weight" in name) and self.enable_pad: + param.data = self.pad_qkv_weight(param.data) + if ("attn.qkv.bias" in name) and self.enable_pad: + param.data = self.pad_qkv_bias(param.data) + loaded_params.add(name) + return loaded_params + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], 
dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * 
grid_thw[:, 2], + grid_thw[:, + 0]).cpu().to(torch.int32) + + # patchify + x = self.patch_embed(x) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=x.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) + seq_len, _ = x.size() + x = x.reshape(seq_len // self.spatial_merge_unit, + self.spatial_merge_unit, -1) + x = x[window_index, :, :] + x = x.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + + cos, sin = self.cal_cos_sin(rotary_pos_emb) + + # transformers + x = x.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) + + # adapter + x = self.merger(x) + reverse_indices = torch.argsort(window_index) + x = x[reverse_indices, :] + return x + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2_5_VLMultiModalProcessor, + info=Qwen2_5_VLProcessingInfo, + dummy_inputs=Qwen2_5_VLDummyInputsBuilder) +class AscendQwen2_5_VLForConditionalGeneration( + Qwen2_5_VLForConditionalGeneration): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.visual = AscendQwen2_5_VisionTransformer( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + 
quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return image_embeds.split(sizes.tolist()) + + def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return video_embeds.split(sizes.tolist()) diff --git a/vllm_ascend/models/qwen2_5_vl_without_padding.py b/vllm_ascend/models/qwen2_5_vl_without_padding.py new file mode 100644 index 0000000..5a243e0 --- /dev/null +++ b/vllm_ascend/models/qwen2_5_vl_without_padding.py @@ -0,0 +1,373 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Adapted from vllm/model_executor/models/qwen2_5_vl.py +# Copyright 2023 The vLLM team. +# +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from typing import Callable, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_npu +from einops import rearrange +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.activation import get_act_and_mul_fn +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed, + Qwen2_5_VisionTransformer, Qwen2_5_VLDummyInputsBuilder, + Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLMultiModalProcessor, + Qwen2_5_VLProcessingInfo) +from vllm.model_executor.models.utils import maybe_prefix +from vllm.multimodal import MULTIMODAL_REGISTRY + +from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding + + +class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__( + embed_dim, + num_heads, + projection_size, + quant_config, + prefix, + ) + self.embed_dim = embed_dim + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + + 
def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1.dev20250226 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=cu_seqlens, + scale_value=self.hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer) + + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() + + output, _ = self.proj(context_layer) + return output + + +class AscendQwen2_5_VisionBlock_Without_Padding(Qwen2_5_VisionBlock): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(dim, num_heads, mlp_hidden_dim, act_fn, norm_layer, + quant_config, prefix) + self.attn = AscendQwen2_5_VisionAttention_Without_Padding( + embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + x = x + self.attn( + self.norm1(x), cu_seqlens=cu_seqlens, cos=cos, sin=sin) + + x = x + self.mlp(self.norm2(x)) + return x 
+ + +class AscendQwen2_5_VisionPatchEmbed_Without_Padding(Qwen2_5_VisionPatchEmbed): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.matmul( + self.proj.weight.data.view(self.hidden_size, -1).transpose(0, 1)) + return x + + +class AscendQwen2_5_VisionTransformer_Without_Padding(Qwen2_5_VisionTransformer + ): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + interleaved=False, + ) -> None: + super().__init__(vision_config, norm_eps, quant_config, prefix) + norm_layer = partial(RMSNorm, eps=norm_eps) + self.interleaved = interleaved + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim // + 2) + self.patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding( + patch_size=vision_config.patch_size, + temporal_patch_size=vision_config.temporal_patch_size, + in_channels=vision_config.in_channels, + hidden_size=self.hidden_size, + ) + + act_fn = get_act_and_mul_fn(vision_config.hidden_act) + self.blocks = nn.ModuleList([ + AscendQwen2_5_VisionBlock_Without_Padding( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=act_fn, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(vision_config.depth) + ]) + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + self.hidden_size, self.num_heads) + + def cal_cos_sin(self, rotary_pos_emb): + cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] + sin = rotary_pos_emb.sin() + + if not self.interleaved: + cos_new = torch.cat((cos, cos), dim=-1) + sin_new = torch.cat((sin, sin), dim=-1) + else: + cos_new = rearrange(torch.stack((cos, cos), dim=-1), + "... 
d two -> ...(d two)", + two=2) + sin_new = rearrange(torch.stack((sin, sin), dim=-1), + "... d two -> ...(d two)", + two=2) + cos_new = cos_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + sin_new = sin_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + return cos_new, sin_new + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + 
index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, + 0]).cpu().to(torch.int32) + + # patchify + x = self.patch_embed(x) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=x.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + cu_window_seqlens = torch.diff(cu_window_seqlens).cpu().to(torch.int32) + seq_len, _ = x.size() + x = x.reshape(seq_len // self.spatial_merge_unit, + self.spatial_merge_unit, -1) + x = x[window_index, :, :] + x = x.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + + cos, sin = self.cal_cos_sin(rotary_pos_emb) + + # transformers + x = x.unsqueeze(1) + for layer_num, blk in 
enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + x = blk(x, cu_seqlens=cu_seqlens_now, cos=cos, sin=sin) + + # adapter + x = self.merger(x) + reverse_indices = torch.argsort(window_index) + x = x[reverse_indices, :] + return x + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen2_5_VLMultiModalProcessor, + info=Qwen2_5_VLProcessingInfo, + dummy_inputs=Qwen2_5_VLDummyInputsBuilder) +class AscendQwen2_5_VLForConditionalGeneration_Without_Padding( + Qwen2_5_VLForConditionalGeneration): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.visual = AscendQwen2_5_VisionTransformer_Without_Padding( + vision_config=config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) + + def _process_image_input(self, image_input) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + if image_input["type"] == "image_embeds": + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return image_embeds.split(sizes.tolist()) + + def _process_video_input(self, video_input) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + if video_input["type"] == "video_embeds": + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + return video_embeds.split(sizes.tolist()) diff --git a/vllm_ascend/models/qwen2_vl.py b/vllm_ascend/models/qwen2_vl.py new file mode 100644 index 0000000..a677b06 --- /dev/null +++ b/vllm_ascend/models/qwen2_vl.py @@ -0,0 +1,352 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapted from vllm/model_executor/models/qwen2_vl.py +# This file is a part of the vllm-ascend project. 
+ +from collections.abc import Iterable +from functools import partial +from typing import Callable, Optional, Set, Tuple, Type + +import torch +import torch.nn as nn +import torch_npu +from einops import rearrange +from transformers.models.qwen2_vl.configuration_qwen2_vl import \ + Qwen2VLVisionConfig +from vllm.config import VllmConfig +from vllm.distributed import utils as dist_utils +from vllm.model_executor.layers.activation import QuickGELU +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.qwen2_vl import ( + Qwen2VisionAttention, Qwen2VisionBlock, Qwen2VisionPatchEmbed, + Qwen2VisionTransformer, Qwen2VLDummyInputsBuilder, + Qwen2VLForConditionalGeneration, Qwen2VLMultiModalProcessor, + Qwen2VLProcessingInfo) +from vllm.model_executor.models.utils import maybe_prefix +from vllm.multimodal import MULTIMODAL_REGISTRY + +MIN_PAD_SIZE = 64 # min_size to pad weight +MAX_PAD_SIZE = 128 # max_size to pad weight + + +class AscendQwen2VisionAttention(Qwen2VisionAttention): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__( + embed_dim, + num_heads, + projection_size, + quant_config, + prefix, + ) + self.cu_seqlens = None + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head + if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: + self.hidden_size_per_attention_head = MAX_PAD_SIZE + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + + self.cu_seqlens = cu_seqlens + + # [s, b, c] --> [s, b, 3 * head * head_dim] + x, _ = self.qkv(x) + + # [s, b, 
3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = [ + rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) + ] + q = torch_npu.npu_rotary_mul(q, cos, sin) + k = torch_npu.npu_rotary_mul(k, cos, sin) + q, k, v = [ + rearrange(x, "b s h d -> (b s) h d").contiguous() + for x in (q, k, v) + ] + + context_layer = torch.empty_like(q) + + # operator requires pta version >= 2.5.1 + torch_npu._npu_flash_attention_unpad( + query=q, + key=k, + value=v, + seq_len=self.cu_seqlens, + scale_value=self.origin_hidden_size_per_attention_head**-0.5, + num_heads=self.num_attention_heads_per_partition, + num_kv_heads=self.num_attention_heads_per_partition, + out=context_layer) + context_layer = rearrange(context_layer, + "(b s) h d -> s b (h d)", + b=batch_size).contiguous() + + output, _ = self.proj(context_layer) + return output + + +class AscendQwen2VisionBlock(Qwen2VisionBlock): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float, + act_layer: Type[nn.Module] = QuickGELU, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(dim, num_heads, mlp_ratio, act_layer, norm_layer, + quant_config, prefix) + self.attn = AscendQwen2VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + x = x + self.attn( + self.norm1(x), + cu_seqlens=cu_seqlens, + cos=cos, + sin=sin, + ) + + x = x + self.mlp(self.norm2(x)) + return x + + +class AscendQwen2VisionPatchEmbed(Qwen2VisionPatchEmbed): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.matmul( + self.proj.weight.data.view(self.embed_dim, -1).transpose(0, 1)) + return x + + +class 
AscendQwen2VisionTransformer(Qwen2VisionTransformer): + + def __init__( + self, + vision_config: Qwen2VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + interleaved=False, + ) -> None: + super().__init__(vision_config, norm_eps, quant_config, prefix) + + self.interleaved = interleaved + self.enable_pad = False + self.depth = vision_config.depth + self.hidden_size = vision_config.embed_dim + self.num_heads = vision_config.num_heads + self.patch_embed = AscendQwen2VisionPatchEmbed( + patch_size=vision_config.patch_size, + temporal_patch_size=vision_config.temporal_patch_size, + in_channels=vision_config.in_channels, + embed_dim=vision_config.embed_dim, + ) + + self.blocks = nn.ModuleList([ + AscendQwen2VisionBlock(dim=self.embed_dim, + num_heads=self.num_heads, + mlp_ratio=vision_config.mlp_ratio, + norm_layer=partial(nn.LayerNorm, + eps=norm_eps), + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(vision_config.depth) + ]) + + self.hidden_size_per_attention_head = dist_utils.divide( + self.hidden_size, self.num_heads) + + if self.hidden_size_per_attention_head > MIN_PAD_SIZE and self.hidden_size_per_attention_head < MAX_PAD_SIZE: + self.enable_pad = True + self.origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head + self.half_origin_hidden_size_per_attention_head = self.hidden_size_per_attention_head // 2 + self.half_pad_hidden_size_per_attention_head = ( + MAX_PAD_SIZE - self.hidden_size_per_attention_head) // 2 + self.hidden_size_per_attention_head = MAX_PAD_SIZE + + def cal_cos_sin(self, rotary_pos_emb): + cos = rotary_pos_emb.cos() # [seqlen, rotary_dim / 2] + sin = rotary_pos_emb.sin() + if self.enable_pad: + cos = torch.nn.functional.pad( + cos, (0, self.half_pad_hidden_size_per_attention_head)) + sin = torch.nn.functional.pad( + sin, (0, self.half_pad_hidden_size_per_attention_head)) + + if not self.interleaved: + cos_new = 
torch.cat((cos, cos), dim=-1) + sin_new = torch.cat((sin, sin), dim=-1) + else: + cos_new = rearrange(torch.stack((cos, cos), dim=-1), + "... d two -> ...(d two)", + two=2) + sin_new = rearrange(torch.stack((sin, sin), dim=-1), + "... d two -> ...(d two)", + two=2) + cos_new = cos_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + sin_new = sin_new.reshape(1, -1, 1, + self.hidden_size_per_attention_head) + return cos_new, sin_new + + def pad_qkv_bias(self, bias): + first_half = bias.reshape( + -1, 3, self.origin_hidden_size_per_attention_head + )[:, :, :self.half_origin_hidden_size_per_attention_head] + second_half = bias.reshape( + -1, 3, self.origin_hidden_size_per_attention_head + )[:, :, self.half_origin_hidden_size_per_attention_head:] + first_half_padded = torch.nn.functional.pad( + first_half, (0, self.half_pad_hidden_size_per_attention_head)) + second_half_padded = torch.nn.functional.pad( + second_half, (0, self.half_pad_hidden_size_per_attention_head)) + bias_padded = torch.cat([first_half_padded, second_half_padded], dim=2) + bias_final = bias_padded.reshape(-1) + return bias_final + + def pad_qkv_weight(self, data): + qkv_weight_first_half = data.reshape( + -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size + )[:, :, :self.half_origin_hidden_size_per_attention_head, :] + qkv_weight_second_half = data.reshape( + -1, 3, self.origin_hidden_size_per_attention_head, self.hidden_size + )[:, :, self.half_origin_hidden_size_per_attention_head:, :] + + qkv_weight_first_half_padded = torch.nn.functional.pad( + qkv_weight_first_half, + (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) + qkv_weight_second_half_padded = torch.nn.functional.pad( + qkv_weight_second_half, + (0, 0, 0, self.half_pad_hidden_size_per_attention_head)) + qkv_weight_padded = torch.cat( + [qkv_weight_first_half_padded, qkv_weight_second_half_padded], + dim=2) + qkv_weight_final = qkv_weight_padded.reshape(-1, self.hidden_size) + return qkv_weight_final + + 
def pad_proj_weight(self, data): + out_weight = torch.nn.functional.pad( + data.reshape(self.hidden_size, -1, + self.half_origin_hidden_size_per_attention_head), + (0, self.half_pad_hidden_size_per_attention_head, 0, 0)).reshape( + self.hidden_size, -1) + return out_weight + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + if ("attn.proj.weight" in name) and self.enable_pad: + param.data = self.pad_proj_weight(param.data) + if ("attn.qkv.weight" in name) and self.enable_pad: + param.data = self.pad_qkv_weight(param.data) + if ("attn.qkv.bias" in name) and self.enable_pad: + param.data = self.pad_qkv_bias(param.data) + loaded_params.add(name) + return loaded_params + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # compute cu_seqlens and avoid cumsum to fit operator unpadFA + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, + 0]).cpu().to(torch.int32) + + # patchify + x = x.to(device=self.device, dtype=self.dtype) + x = self.patch_embed(x) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + cos, sin = self.cal_cos_sin(rotary_pos_emb) + + x = x.unsqueeze(1) + for blk in self.blocks: + x = blk(x, 
cu_seqlens=cu_seqlens, cos=cos, sin=sin) + + # adapter + x = self.merger(x) + return x + + +@MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor, + info=Qwen2VLProcessingInfo, + dummy_inputs=Qwen2VLDummyInputsBuilder) +class AscendQwen2VLForConditionalGeneration(Qwen2VLForConditionalGeneration): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.visual = AscendQwen2VisionTransformer( + self.config.vision_config, + norm_eps=getattr(self.config, "rms_norm_eps", 1e-6), + quant_config=self._maybe_ignore_quant_config( + vllm_config.quant_config), + prefix=maybe_prefix(prefix, "visual"), + ) \ No newline at end of file diff --git a/vllm_ascend/models/qwen3.py b/vllm_ascend/models/qwen3.py new file mode 100644 index 0000000..a05106f --- /dev/null +++ b/vllm_ascend/models/qwen3.py @@ -0,0 +1,156 @@ +from collections.abc import Iterable +from typing import Optional, Union + +import torch +from torch import nn +from transformers import Qwen3Config +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.models.qwen3 import Qwen3DecoderLayer +from vllm.model_executor.models.utils import (AutoWeightsLoader, + PPMissingLayer, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from vllm_ascend.ops.layernorm import AddRMSNormW8A8Quant + + +class CustomQwen3DecoderLayer(Qwen3DecoderLayer): + + def __init__( + self, + config: 
Qwen3Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix) + if quant_config is None: + return + + from vllm_ascend.quantization.quant_config import AscendQuantConfig + from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod + + assert isinstance(quant_config, AscendQuantConfig), \ + "Expected quant_config to be an instance of AscendQuantConfig" + + if isinstance(self.self_attn.qkv_proj.quant_method.quant_method, + AscendW8A8LinearMethod): + self.input_layernorm = AddRMSNormW8A8Quant( + config.hidden_size, + layer=self.self_attn.qkv_proj, + eps=config.rms_norm_eps) + if isinstance(self.mlp.gate_up_proj.quant_method.quant_method, + AscendW8A8LinearMethod): + self.post_attention_layernorm = AddRMSNormW8A8Quant( + config.hidden_size, + layer=self.mlp.gate_up_proj, + eps=config.rms_norm_eps) + + +ALL_DECODER_LAYER_TYPES = { + "attention": CustomQwen3DecoderLayer, +} + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) +class CustomQwen3Model(Qwen2Model): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + decoder_layer_type=CustomQwen3DecoderLayer) + + +class CustomQwen3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + # add `CustomQwen3Model` to init self.model + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = CustomQwen3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: 
SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=(["lm_head."]
+                           if self.config.tie_word_embeddings else None),
+        )
+        return loader.load_weights(weights)
diff --git a/vllm_ascend/models/qwen3_moe.py b/vllm_ascend/models/qwen3_moe.py
new file mode 100644
index 0000000..2fa10f0
--- /dev/null
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -0,0 +1,393 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from vllm/model_executor/models/qwen3_moe.py
+# This file is a part of the vllm-ascend project. 
+ +from typing import Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, CompilationLevel, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group) +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.interfaces import (MixtureOfExperts, + SupportsLoRA, SupportsPP) +from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention, + Qwen3MoeDecoderLayer, + Qwen3MoeForCausalLM, + Qwen3MoeMLP, Qwen3MoeModel, + Qwen3MoeSparseMoeBlock) +from vllm.model_executor.models.utils import ( + PPMissingLayer, extract_layer_index, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, + init_metadata_for_sp) +from vllm_ascend.utils import vllm_version_is + + +class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + nn.Module.__init__(self) + self.tp_size = get_tensor_model_parallel_world_size() + if self.tp_size > config.num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of 
experts {config.num_experts}.") + + self.gate = ReplicatedLinear( + config.hidden_size, + config.num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + self.experts = AscendFusedMoE( + num_experts=config.num_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + prefix=f"{prefix}.experts", + ) + + self.top_k = config.num_experts_per_tok + + self.dp_size = get_dp_group().world_size + + self.tp_group = get_tp_group().device_group + self.tp_rank = get_tp_group().rank_in_group + self.ep_group = get_ep_group() + + self.params_dtype = torch.get_default_dtype() + + def forward( + self, + hidden_states, + attn_metadata=None, + _metadata_for_padding: Optional[MetadataForPadding] = None, + ): + if attn_metadata is None: + attn_metadata = get_forward_context().attn_metadata + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. 
+ enable_force_load_balance = get_forward_context().in_profile_run + is_prefill = get_forward_context().with_prefill + + # router_logits: (num_tokens, n_experts) + router_logits, _ = self.gate(hidden_states) + + hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + is_prefill=is_prefill, + top_k=self.top_k, + enable_force_load_balance=enable_force_load_balance, + shared_experts=None, + _metadata_for_padding=_metadata_for_padding, + ) + + return hidden_states + + +class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + vllm_config: Optional[VllmConfig] = None, + prefix: str = "", + ) -> None: + + nn.Module.__init__(self) + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = Qwen3MoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'attention_bias', False), + head_dim=getattr(config, 'head_dim', None), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + # `mlp_only_layers` in the config. 
+ layer_idx = extract_layer_index(prefix) + mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else + config.mlp_only_layers) + self.use_aclgraph = (vllm_config is not None + and vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE + and not vllm_config.model_config.enforce_eager) + if (layer_idx not in mlp_only_layers) and ( + config.num_experts > 0 and + (layer_idx + 1) % config.decoder_sparse_step == 0): + if not self.use_aclgraph: + # FIXME: custom sparse moe block doesn't work with aclgraph. + self.mlp = CustomSparseMoeBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + else: + self.mlp = Qwen3MoeSparseMoeBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + else: + self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + self.enable_sequence_parallelism = ( + vllm_config.compilation_config.pass_config. 
+ enable_sequence_parallelism if vllm_config is not None else False) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + _metadata_for_padding: Optional[MetadataForPadding] = None, + ) -> torch.Tensor: + + # To prevent precision issues during the decoder phase when only prefilling enables SP + if not self.enable_sequence_parallelism: + self.self_attn.o_proj.reduce_results = True + else: + self.self_attn.o_proj.reduce_results = not _metadata_for_padding.not_dummy_and_is_prefill if _metadata_for_padding is not None else True + + # Self Attention + if residual is None: + residual = hidden_states + if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill: + residual = _metadata_for_padding.padding_slice(residual) + + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill: + hidden_states = _metadata_for_padding.allgather_unpadding_aligned( + hidden_states) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill: + hidden_states = _metadata_for_padding.padding_aligned_reduce_scatter( + hidden_states) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if not self.use_aclgraph: + hidden_states = self.mlp( + hidden_states, _metadata_for_padding=_metadata_for_padding) + else: + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class CustomQwen3MoeModel(Qwen3MoeModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + 
parallel_config = vllm_config.parallel_config + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + self.num_redundant_experts = parallel_config.num_redundant_experts + else: + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + prefix=f"{prefix}.embed_tokens") + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: CustomQwen3MoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + vllm_config=vllm_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + _metadata_for_padding: Optional[MetadataForPadding] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + residual, + _metadata_for_padding=_metadata_for_padding) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, 
_ = self.norm(hidden_states, residual) + + if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill: + hidden_states = _metadata_for_padding.allgather_unpadding_aligned( + hidden_states) + + return hidden_states + + +class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + "experts": + ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + nn.Module.__init__(self) + SupportsPP.__init__(self) + SupportsLoRA.__init__(self) + MixtureOfExperts.__init__(self) + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = CustomQwen3MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head")) + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism + # Set MoE hyperparameters + self.expert_weights: list[torch.Tensor] = [] + + self.moe_layers: list[FusedMoE] = [] + example_layer = None + for layer in self.model.layers: + if isinstance(layer, PPMissingLayer): + continue + + assert isinstance(layer, Qwen3MoeDecoderLayer) + if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock): + example_layer = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_layer is None: + raise RuntimeError("No Qwen3MoE layer found in the model.layers.") + + self.num_moe_layers = len(self.moe_layers) + 
self.num_expert_groups = 1 + self.num_shared_experts = 0 + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + _metadata_for_padding = init_metadata_for_sp( + input_ids, self.enable_sequence_parallelism) + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, _metadata_for_padding) + return hidden_states diff --git a/vllm_ascend/multistream/__init__.py b/vllm_ascend/multistream/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/multistream/base.py b/vllm_ascend/multistream/base.py new file mode 100644 index 0000000..fba58b4 --- /dev/null +++ b/vllm_ascend/multistream/base.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass +from enum import Enum + + +class MSEventKey(Enum): + ATTN_COM_FINISH = 0 + ATTN_AR_FINISH = 1 + FFN_COM_FINISH = 2 + FFN_AR_FINISH = 3 + # events for MOE dispatch and combine + MOE_BEFORE_COMM = 4 + MOE_AFTER_COMM = 5 + # events for shared expert + MOE_SE_COMM_FINISH = 6 + MOE_SE_COMP_FINISH = 7 + MOE_GATE_FINISH = 8 + + +@dataclass +class MSAttentionMetadataSplitConfig: + """ + micro batch split config for split attention metadata + """ + # micro batch num + num_micro_batches: int = 2 + # split micro batches only when total tokens >= min_total_tokens_to_split + min_total_tokens_to_split: int = 256 + # split micro batches only when prefill tokens >= min_prefill_tokens_to_split + min_prefill_tokens_to_split: int = 64 diff --git a/vllm_ascend/multistream/context.py b/vllm_ascend/multistream/context.py new file mode 100644 index 0000000..a1684f2 --- /dev/null +++ b/vllm_ascend/multistream/context.py @@ -0,0 +1,67 @@ +from contextlib import contextmanager +from typing import Any + +_ms_comm_context: Any = None +_cur_micro_batch_num: int = -1 +_ms_layer_index_context: int = -1 +_ms_metadata_context: 
Any = None +_ms_attn_metadata_context: Any = None + + +def set_multistream_layer_context(start_layer: int, ms_metadata: Any, + attn_metadata: Any): + """ + set multistream layer context before transformer layers + """ + global _ms_layer_index_context, _ms_metadata_context, _ms_attn_metadata_context + _ms_layer_index_context = start_layer + _ms_metadata_context = ms_metadata + _ms_attn_metadata_context = attn_metadata + + +def reset_multistream_layer_context(): + """ + reset multistream layer context + """ + global _ms_layer_index_context, _ms_metadata_context, _ms_attn_metadata_context + _ms_layer_index_context = -1 + _ms_metadata_context = None + _ms_attn_metadata_context = None + + +def get_multistream_layer_context(): + """ + get multistream layer context + """ + return _ms_layer_index_context, _ms_metadata_context, _ms_attn_metadata_context + + +def advance_step_multistream_layer_context(): + """ + advance multistream layer index context + """ + global _ms_layer_index_context + _ms_layer_index_context += 1 + + +def get_multistream_comm_context() -> Any: + """Get the current comm forward context.""" + return _ms_comm_context + + +def get_multistream_microbatch_context() -> int: + return _cur_micro_batch_num + + +@contextmanager +def set_multistream_context(context: Any, micro_batch_num: int): + """A context manager that stores the current comm forward context, + can be attention metadata, etc.""" + global _ms_comm_context, _cur_micro_batch_num + _ms_comm_context = context + _cur_micro_batch_num = micro_batch_num + try: + yield + finally: + _ms_comm_context = None + _cur_micro_batch_num = -1 diff --git a/vllm_ascend/multistream/decorator.py b/vllm_ascend/multistream/decorator.py new file mode 100644 index 0000000..5b573df --- /dev/null +++ b/vllm_ascend/multistream/decorator.py @@ -0,0 +1,22 @@ +from .context import (get_multistream_layer_context, + get_multistream_microbatch_context) + + +# vllm v1 use get_forward_context to get the attn_metadata, +# we can use 
this decorator to update the attn metadata +def set_multistream_support(): + + def decorator(func): + + def wrapper(): + context = func() + layer_index, ms_metadata, attn_metadata = get_multistream_layer_context( + ) + micro_batch_num = get_multistream_microbatch_context() + if layer_index != -1 and micro_batch_num != -1: + context.attn_metadata = attn_metadata[micro_batch_num] + return context + + return wrapper + + return decorator diff --git a/vllm_ascend/multistream/layers.py b/vllm_ascend/multistream/layers.py new file mode 100644 index 0000000..c5273bc --- /dev/null +++ b/vllm_ascend/multistream/layers.py @@ -0,0 +1,61 @@ +from typing import List, Optional, Tuple, Union + +import torch +from vllm.forward_context import get_forward_context + +from .base import MSEventKey +from .context import (get_multistream_layer_context, + reset_multistream_layer_context, + set_multistream_layer_context) +from .metadata import MultiStreamMetadata + + +class MultiStreamPreTransformerLayer(torch.nn.Module): + + def __init__(self, multistream_metadata: MultiStreamMetadata): + super().__init__() + self.multistream_metadata = multistream_metadata + + def forward( + self, + intput_tensors: List[torch.Tensor], + ): + attn_metadata = get_forward_context().attn_metadata + if self.multistream_metadata is None or attn_metadata is None: + set_multistream_layer_context(-1, None, None) + return attn_metadata, intput_tensors + # TODO add attn_metadata management + do_ms, attn_metadata, intput_tensors, _ = self.multistream_metadata.split_micro_batch( + attn_metadata, intput_tensors) + if do_ms: + set_multistream_layer_context( + self.multistream_metadata.start_layer, + self.multistream_metadata, attn_metadata) + else: + set_multistream_layer_context(-1, None, None) + return attn_metadata, intput_tensors + + +class MultiStreamPostTransformerLayer(torch.nn.Module): + + def __init__(self, multistream_metadata: MultiStreamMetadata): + super().__init__() + self.multistream_metadata = 
class MultiStreamPostTransformerLayer(torch.nn.Module):
    """Exit layer placed after the transformer stack: waits for the last
    micro batch's FFN all-reduce, resets the multistream layer context,
    and merges the micro-batch outputs back into one batch."""

    def __init__(self, multistream_metadata: "MultiStreamMetadata"):
        # NOTE: annotation is a string so importing this module does not
        # require MultiStreamMetadata to be resolvable at definition time.
        super().__init__()
        self.multistream_metadata = multistream_metadata

    def forward(self,
                input_tensors: Union[List[Tuple[torch.Tensor]],
                                     List[torch.Tensor],
                                     List[List[torch.Tensor]]],
                wait_layer_index: Optional[int] = None):
        if self.multistream_metadata is None or self.multistream_metadata.ms_config is None:
            return input_tensors
        layer_index, ms_metadata, ms_attn_metadata = get_multistream_layer_context(
        )
        if layer_index >= 0:
            # Default to the last layer's FFN all-reduce finish event.
            true_wait_layer = self.multistream_metadata.end_layer - 1 if wait_layer_index is None else wait_layer_index
            self.multistream_metadata.try_wait_event(
                true_wait_layer,
                self.multistream_metadata.ms_config.num_micro_batches - 1,
                MSEventKey.FFN_AR_FINISH)
            reset_multistream_layer_context()
        return self.multistream_metadata.merge_micro_batches(input_tensors)


def split_micro_batches_tensors(input_tensors,
                                split_index: int,
                                keys: Optional[List[str]] = None):
    """Split tensors into two micro batches at row `split_index`.

    Supports a list of tensors (None entries preserved), a single tensor,
    None, or a dict restricted to `keys`. Returns a 2-element structure
    [pre, post] mirroring the input container.
    """
    if isinstance(input_tensors, list):
        micro_batches = []
        for tensor in input_tensors:
            if tensor is None:
                micro_batches.append([None, None])
            else:
                micro_batches.append(
                    [tensor[:split_index], tensor[split_index:]])
        return micro_batches
    elif isinstance(input_tensors, torch.Tensor):
        return [input_tensors[:split_index], input_tensors[split_index:]]
    elif input_tensors is None:
        return [None, None]
    elif isinstance(input_tensors, dict):
        # Use the builtin `dict` for the runtime check; isinstance() against
        # typing.Dict is deprecated.
        assert keys is not None
        micro_batches_pre = {
            key: input_tensors[key][:split_index]
            for key in keys
        }
        micro_batches_post = {
            key: input_tensors[key][split_index:]
            for key in keys
        }
        return [micro_batches_pre, micro_batches_post]
    else:
        raise NotImplementedError
@dataclass
class MultiStreamStepMetadata:
    """Per-step stream/event handles for overlapping compute and comm.

    Annotations are strings so the class can be defined on builds where
    `torch.npu` is not present (eager annotation evaluation would raise
    AttributeError otherwise).
    """
    # Secondary stream used for communication kernels.
    comm_stream: "torch.npu.Stream" = None
    # Event recorded before the communication is launched.
    before_comm_event: "torch.npu.Event" = None
    # Event recorded after the communication completes.
    after_comm_event: "torch.npu.Event" = None


@dataclass
class MultiStreamConfig:
    """Controls the behavior of multi-stream models."""
    # Do not split batches smaller than this many total tokens.
    min_total_tokens_to_split: int = 256
    # Do not split when the prefill part is smaller than this.
    min_prefill_tokens_to_split: int = 64
    # Number of micro batches a batch is split into.
    num_micro_batches: int = 2
    # Allowed relative imbalance between the two micro batches.
    imbalance_ratio: float = 0.1
class MultiStreamMetadata:
    """Holds streams, per-layer/per-micro-batch events, and the split
    configuration used to run transformer layers in two micro batches.

    Project-type annotations are strings so the class is importable without
    torch_npu / the mla_v1 module being resolvable at definition time.
    """
    # direct stream
    calculate_stream = None
    # delay stream
    communicate_stream = None
    # multi-stream-flag
    enable_multi_stream: bool = False

    def __init__(
        self,
        calculate_stream: "torch.npu.Stream",
        communicate_stream: "torch.npu.Stream",
        start_layer: int,
        end_layer: int,
        event_keys: List["MSEventKey"],
        multistream_config: Optional["MultiStreamConfig"],
        causal_lm: bool = True,
    ):
        self.calculate_stream = calculate_stream
        self.communicate_stream = communicate_stream
        self.start_layer = start_layer
        self.end_layer = end_layer
        self.ms_config = multistream_config
        self.causal_lm = causal_lm
        # BUGFIX: ms_events was a mutable class-level dict, silently shared
        # by every MultiStreamMetadata instance; make it per-instance.
        self.ms_events: Dict[int, Dict[int, Dict["MSEventKey",
                                                 "torch.npu.Event"]]] = {}
        self._build_events(event_keys)
        self._build_ms_split_config()

    def _build_events(self, event_keys):
        # One event per (layer, micro-batch, key); layers start one before
        # start_layer so the first layer can wait on its predecessor.
        if self.ms_config is not None:
            for i in range(self.start_layer - 1, self.end_layer):
                self.ms_events[i] = {}
                for j in range(self.ms_config.num_micro_batches):
                    self.ms_events[i][j] = {}
                    for key in event_keys:
                        self.ms_events[i][j][key] = torch.npu.Event()

    def _build_ms_split_config(self):
        # NOTE: ms_split_config is only defined when ms_config is set;
        # split_micro_batch must not be called otherwise.
        if self.ms_config is not None:
            self.ms_split_config = MSAttentionMetadataSplitConfig(
                num_micro_batches=self.ms_config.num_micro_batches,
                min_total_tokens_to_split=self.ms_config.
                min_total_tokens_to_split,
                min_prefill_tokens_to_split=self.ms_config.
                min_prefill_tokens_to_split,
            )

    def try_wait_event(self, layer_index: int, micro_batch_index: int,
                       event_key: "MSEventKey"):
        """Block the current stream until the given event is recorded."""
        self.ms_events[layer_index][micro_batch_index][event_key].wait()

    def try_record_event(self, layer_index: int, micro_batch_index: int,
                         event_key: "MSEventKey"):
        """Record the given event on the current stream."""
        self.ms_events[layer_index][micro_batch_index][event_key].record()

    def split_micro_batch(
        self,
        attn_metadata: "AscendMLAMetadata",
        intput_tensors: List[torch.Tensor],
        intermediate_tensors: Optional["IntermediateTensors"] = None,
        intermediate_tensors_keys: Optional[List[str]] = None,
    ) -> "Tuple[bool, Union[AscendMLAMetadata, List[AscendMLAMetadata]], Union[List[torch.Tensor], List[List[torch.Tensor]]], Union[IntermediateTensors, List[IntermediateTensors]]]":
        """Try to split the batch; returns (did_split, attn_metadata(s),
        input tensors, intermediate tensors), unsplit when not worthwhile."""
        attn_metadata_list = attn_metadata.split_metadata_for_multistream(
            self.ms_split_config)
        if len(attn_metadata_list) == 1:
            return False, attn_metadata_list[
                0], intput_tensors, intermediate_tensors
        # Token split point equals the first micro batch's slot count.
        split_index = attn_metadata_list[0].slot_mapping.shape[0]
        input_tensors = split_micro_batches_tensors(intput_tensors,
                                                    split_index)
        if intermediate_tensors is not None:
            inter_tensors_list = split_micro_batches_tensors(
                intermediate_tensors.tensors, split_index,
                intermediate_tensors_keys)
            intermediate_tensors = [
                IntermediateTensors(inter_tensors)
                for inter_tensors in inter_tensors_list
            ]
        return True, attn_metadata_list, input_tensors, intermediate_tensors

    def merge_micro_batches(
        self, input_tensors: Union[List[torch.Tensor],
                                   List[List[torch.Tensor]]]
    ) -> List[torch.Tensor]:
        """Concatenate per-micro-batch tensors back into full-batch tensors;
        passes through when the input is already unsplit (or None)."""
        if input_tensors is None or isinstance(input_tensors[0], torch.Tensor):
            return input_tensors
        batch: List[Optional[torch.Tensor]] = []
        for tensors in input_tensors:
            if tensors is None or tensors[0] is None:
                batch.append(None)
            else:
                batch.append(torch.cat(tensors, dim=0))
        return batch
def compute_split_seq_index(
    query_lens: Optional[list[int]],
    attn_state: "AscendAttentionState",
    num_tokens: int,
    imbalance_ratio: float = 0.1,
) -> list[int]:
    """Find [token_split, seq_split] that halves the batch by token count.

    Non-decode: walk the per-sequence lengths until half the tokens are
    covered; accept the boundary on either side of the midpoint if it is
    within `imbalance_ratio` of balanced, otherwise return [0, 0] (no split).
    Decode-only: every sequence is one token, so split exactly in half.
    """
    if attn_state == AscendAttentionState.DecodeOnly:
        half = num_tokens // 2
        return [half, half]
    assert query_lens is not None
    total_tokens = sum(query_lens)
    tokens = 0
    split_index = 0
    # the first index in last split
    for value in query_lens:
        tokens += value
        split_index += 1
        if tokens < total_tokens // 2:
            continue
        # check the current split index
        if abs(tokens - total_tokens // 2) < total_tokens * imbalance_ratio:
            return [tokens, split_index]
        # check the previous split index
        if abs(tokens - total_tokens // 2 -
               value) < total_tokens * imbalance_ratio:
            return [tokens - value, split_index - 1]
        # fail to split if it is imbalanced
        # TODO: split tokens in seq
        return [0, 0]
    return [0, 0]


def make_multistream_metadata_ds(
    start_layer: int,
    end_layer: int,
    causal_lm: bool = True,
    multistream_config: Optional["MultiStreamConfig"] = None,
):
    """Build a MultiStreamMetadata for layers [start_layer, end_layer) with
    the full DeepSeek event-key set; returns None when multistream is off."""
    if multistream_config is None:
        return None
    event_keylist = [
        MSEventKey.ATTN_COM_FINISH,
        MSEventKey.ATTN_AR_FINISH,
        MSEventKey.FFN_COM_FINISH,
        MSEventKey.FFN_AR_FINISH,
        MSEventKey.MOE_BEFORE_COMM,
        MSEventKey.MOE_AFTER_COMM,
        MSEventKey.MOE_SE_COMM_FINISH,
        MSEventKey.MOE_SE_COMP_FINISH,
        MSEventKey.MOE_GATE_FINISH,
    ]
    return MultiStreamMetadata(
        calculate_stream=torch.npu.current_stream(),
        communicate_stream=torch.npu.Stream(),
        start_layer=start_layer,
        end_layer=end_layer,
        multistream_config=multistream_config,
        event_keys=event_keylist,
        causal_lm=causal_lm,
    )
def split_attn_tensor_type(
    input_tensor: torch.Tensor,
    index: int,
) -> List[torch.Tensor]:
    """Split a tensor into [rows before index, rows from index on]."""
    return [input_tensor[:index], input_tensor[index:]]


def split_attn_int_type(
    var: int,
    index: int,
) -> List[int]:
    # FIX: return annotation was List[torch.Tensor]; this splits an int
    # count into two non-negative int counts.
    return [min(var, index), max(var - index, 0)]


def model_input_split_v1_mla_attn(
    attn_metadata,
    _metadata_cls,
    ms_split_config: "MSAttentionMetadataSplitConfig",
) -> List[Any]:
    """Split MLA attention metadata into two micro batches.

    Returns [attn_metadata] unchanged when metadata is None or no balanced
    split point exists; otherwise returns [pre, post] metadata built with
    `_metadata_cls`. Decode sequences always precede prefill sequences, so
    the split point is expressed both as a token index and a seq index.
    """
    assert 0 < ms_split_config.num_micro_batches < 3
    if attn_metadata is None:
        return [attn_metadata]
    [token_index,
     seq_index] = compute_split_seq_index(attn_metadata.query_lens,
                                          attn_metadata.attn_state,
                                          attn_metadata.num_decode_tokens)
    if token_index == 0 or seq_index == 0 or seq_index == len(
            attn_metadata.query_lens):
        return [attn_metadata]

    query_start_loc_cpu = np.zeros(shape=(len(attn_metadata.query_lens) + 1, ),
                                   dtype=int)
    np.cumsum(attn_metadata.query_lens, out=query_start_loc_cpu[1:])
    if attn_metadata.num_prefills > 0:
        prefill_query_start_loc = np.zeros(
            shape=(len(attn_metadata.prefill.query_lens) + 1, ), dtype=int)
        np.cumsum(attn_metadata.prefill.query_lens,
                  out=prefill_query_start_loc[1:])

    # split attn metadata
    [slot_mapping_pre,
     slot_mapping_post] = split_attn_tensor_type(attn_metadata.slot_mapping,
                                                 token_index)
    [num_decodes_pre,
     num_decodes_post] = split_attn_int_type(attn_metadata.num_decodes,
                                             seq_index)
    [num_decode_tokens_pre, num_decode_tokens_post
     ] = split_attn_int_type(attn_metadata.num_decode_tokens, token_index)
    [num_prefills_pre, num_prefills_post
     ] = split_attn_int_type(attn_metadata.num_prefills,
                             max(0, seq_index - attn_metadata.num_decodes))
    seq_lens = attn_metadata.prefill.seq_lens if attn_metadata.num_prefills > 0 else attn_metadata.decode.seq_lens
    [seq_lens_pre, seq_lens_post] = split_attn_tensor_type(seq_lens, seq_index)

    query_start_loc_pre = query_start_loc_post = None
    if attn_metadata.query_start_loc is not None:
        query_start_loc_pre = attn_metadata.query_start_loc[:seq_index + 1]
        # Re-base the post half so its cumulative offsets start at zero.
        query_start_loc_post = deepcopy(
            attn_metadata.query_start_loc[seq_index:]
        ) - attn_metadata.query_start_loc[seq_index]
    [block_table_pre,
     block_table_post] = split_attn_tensor_type(attn_metadata.block_tables,
                                                seq_index)
    assert attn_metadata.attn_mask is not None
    if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache or attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
        # the attn_mla kernel in torch npu only accept 128*128 attn mask
        attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
        attn_state_pre = attn_state_post = attn_metadata.attn_state
    elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
        # should be none in decode only state
        attn_mask_pre = attn_mask_post = attn_metadata.attn_mask
        attn_state_pre = attn_state_post = AscendAttentionState.DecodeOnly
    else:
        # chunked prefill
        if num_prefills_pre > 0:
            # FIX: dropped a redundant re-assignment of attn_state_post that
            # immediately followed this line in the original.
            attn_state_pre = attn_state_post = AscendAttentionState.ChunkedPrefill
            attn_mask_pre = attn_metadata.attn_mask[:token_index, :max(
                seq_lens_pre)].contiguous()
            attn_mask_post = attn_metadata.attn_mask[
                token_index:, :max(seq_lens_post)].contiguous()
        else:
            attn_state_pre = AscendAttentionState.DecodeOnly
            attn_mask_pre = None
            attn_state_post = AscendAttentionState.ChunkedPrefill
            attn_mask_post = attn_metadata.attn_mask[
                token_index:, :max(seq_lens_post)].contiguous()
    # Imported lazily to avoid a circular import with mla_v1.
    from vllm_ascend.attention.mla_v1 import (AscendMLADecodeMetadata,
                                              AscendMLAPrefillMetadata)
    if num_prefills_pre > 0:
        # split metadata.prefill
        [input_positions_pre, input_positions_post] = split_attn_tensor_type(
            attn_metadata.prefill.input_positions,
            token_index - attn_metadata.num_decode_tokens)
        [block_tables_pre, block_tables_post
         ] = split_attn_tensor_type(attn_metadata.prefill.block_table,
                                    seq_index - attn_metadata.num_decodes)
        [prefill_query_lens_pre, prefill_query_lens_post
         ] = split_attn_tensor_type(attn_metadata.prefill.query_lens,
                                    seq_index - attn_metadata.num_decodes)
        prefill_query_start_loc_pre = attn_metadata.prefill.query_start_loc[:seq_index + 1 - attn_metadata.num_decodes]
        prefill_query_start_loc_post = deepcopy(
            attn_metadata.prefill.query_start_loc[seq_index -
                                                  attn_metadata.num_decodes:]
        ) - attn_metadata.prefill.query_start_loc[seq_index -
                                                  attn_metadata.num_decodes]
        context_len_pre = seq_lens_pre[attn_metadata.num_decodes:]
        context_len_post = seq_lens_post
        prefill_max_query_len_pre = max(prefill_query_lens_pre)
        prefill_max_query_len_post = max(prefill_query_lens_post)
        prefill_pre = AscendMLAPrefillMetadata(
            attn_mask=attn_mask_pre,
            query_lens=prefill_query_lens_pre,
            seq_lens=seq_lens_pre,
            query_start_loc=prefill_query_start_loc_pre,
            input_positions=input_positions_pre,
            context_lens=context_len_pre,
            block_table=block_tables_pre,
            max_query_len=prefill_max_query_len_pre,
            max_seq_lens=context_len_pre.max().item(),
        )
        prefill_post = AscendMLAPrefillMetadata(
            attn_mask=attn_mask_post,
            query_lens=prefill_query_lens_post,
            seq_lens=seq_lens_post,
            query_start_loc=prefill_query_start_loc_post,
            input_positions=input_positions_post,
            context_lens=context_len_post,
            block_table=block_tables_post,
            max_query_len=prefill_max_query_len_post,
            max_seq_lens=context_len_post.max().item(),
        )
        decode_pre = attn_metadata.decode
        decode_post = None
    else:
        # prefill is None, split metadata.decode
        [input_positions_pre, input_positions_post
         ] = split_attn_tensor_type(attn_metadata.decode.input_positions,
                                    token_index)
        [block_tables_pre, block_tables_post
         ] = split_attn_tensor_type(attn_metadata.decode.block_table,
                                    seq_index)
        [decode_seq_lens_pre,
         decode_seq_lens_post] = split_attn_tensor_type(seq_lens, seq_index)
        decode_pre = AscendMLADecodeMetadata(
            input_positions=input_positions_pre,
            block_table=block_tables_pre,
            seq_lens=decode_seq_lens_pre,
            max_seq_lens=max(decode_seq_lens_pre),
            seq_lens_list=decode_seq_lens_pre.tolist(),
        )
        decode_post = AscendMLADecodeMetadata(
            input_positions=input_positions_post,
            block_table=block_tables_post,
            seq_lens=decode_seq_lens_post,
            max_seq_lens=max(decode_seq_lens_post),
            seq_lens_list=decode_seq_lens_post.tolist(),
        )
        prefill_pre = None
        prefill_post = attn_metadata.prefill
    # construct metadata (FIX: removed a duplicate local import of
    # AscendMLAPrefillMetadata that shadowed the one above)
    attention_metadata_pre = _metadata_cls(
        num_actual_tokens=token_index,
        num_input_tokens=token_index,
        head_dim=attn_metadata.head_dim,
        slot_mapping=slot_mapping_pre,
        seq_lens=seq_lens_pre,
        query_start_loc=query_start_loc_pre,
        block_tables=block_table_pre,
        num_decodes=num_decodes_pre,
        num_prefills=num_prefills_pre,
        num_decode_tokens=num_decode_tokens_pre,
        attn_state=attn_state_pre,
        attn_mask=attn_mask_pre,
        prefill=prefill_pre,
        decode=decode_pre,
        enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp,
    )
    attention_metadata_post = _metadata_cls(
        num_actual_tokens=attn_metadata.num_actual_tokens - token_index,
        num_input_tokens=attn_metadata.num_input_tokens - token_index,
        head_dim=attn_metadata.head_dim,
        slot_mapping=slot_mapping_post,
        seq_lens=seq_lens_post,
        query_start_loc=query_start_loc_post,
        block_tables=block_table_post,
        num_decodes=num_decodes_post,
        num_prefills=num_prefills_post,
        num_decode_tokens=num_decode_tokens_post,
        attn_mask=attn_mask_post,
        attn_state=attn_state_post,
        prefill=prefill_post,
        decode=decode_post,
        enable_dbo_across_dp=attn_metadata.enable_dbo_across_dp,
    )
    return [attention_metadata_pre, attention_metadata_post]
class dummyFusionOp:
    """Placeholder for a torch._C fusion op that is unavailable on Ascend;
    only carries the op's name so lookups do not fail."""
    default = None

    def __init__(self, name=""):
        self.name = name


def register_dummy_fusion_op() -> None:
    """Install dummy entries for every CUDA fusion op vllm may probe on
    torch.ops._C, so attribute access succeeds on Ascend builds."""
    dummy_op_names = (
        "rms_norm",
        "fused_add_rms_norm",
        "static_scaled_fp8_quant",
        "dynamic_scaled_fp8_quant",
        "dynamic_per_token_scaled_fp8_quant",
        "rms_norm_static_fp8_quant",
        "fused_add_rms_norm_static_fp8_quant",
        "rms_norm_dynamic_per_token_quant",
    )
    for op_name in dummy_op_names:
        setattr(torch.ops._C, op_name, dummyFusionOp(name=op_name))
class AscendQuickGELU(QuickGELU):
    """QuickGELU backed by the fused torch_npu fast-gelu kernel."""

    # FIX: annotation was `torch.tensor` (the factory function); the type
    # is `torch.Tensor`.
    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        import torch_npu

        out = torch_npu.npu_fast_gelu(x)
        return out


class AscendSiluAndMul(SiluAndMul):
    """SiluAndMul backed by the fused torch_npu swiglu kernel."""

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        import torch_npu

        from vllm_ascend.utils import is_310p

        if is_310p():
            # 310P swiglu requires fp32 compute; cast back to fp16 after.
            out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
        else:
            out = torch_npu.npu_swiglu(x)
        return out
# Implementation of vanilla chunked prefill, should be removed after the kernel is ready for
# all the corner case
def vanilla_chunked_prefill(
    output: torch.Tensor,
    query: torch.Tensor,  # (num_tokens, heads, head_size)
    key_cache: torch.Tensor,  # (num_blocks, block_size, kv_heads, head_size)
    value_cache: torch.
    Tensor,  # (num_blocks, block_size, kv_heads, head_size,)
    block_tables: torch.Tensor,  # (num_seqs, max_num_blocks_per_seq)
    cu_seqlen_q: torch.Tensor,  # (num_seqs + 1,)
    cu_seqlen_k: torch.Tensor,  # (num_seqs + 1,)
    max_seqlen_q: int,
    max_seqlen_k: int,
    scale: float,
    alibi_slopes: Optional[torch.Tensor],
    causal: bool = True,
) -> torch.Tensor:
    """Reference chunked-prefill attention on NPU.

    Gathers K/V from the paged cache via block_tables, pads Q/K/V to dense
    [batch, max_len, heads, dim] form, applies padding (and optionally
    causal) masks, and writes the result into `output`. `alibi_slopes` is
    currently unused. All work tensors live on device "npu".
    """
    num_query_heads = query.shape[1]
    head_dim = value_cache.shape[3]
    num_kv_heads = value_cache.shape[2]
    block_size = value_cache.shape[1]
    num_batch = cu_seqlen_q.shape[0] - 1
    max_num_blocks_per_seq = block_tables.shape[1]

    # Gather the paged cache into contiguous per-sequence K/V.
    key = key_cache[block_tables].view(num_batch,
                                       max_num_blocks_per_seq * block_size,
                                       num_kv_heads, head_dim)
    value = value_cache[block_tables].view(num_batch,
                                           max_num_blocks_per_seq * block_size,
                                           num_kv_heads, head_dim)
    key = key[:, :max_seqlen_k, :, :]
    value = value[:, :max_seqlen_k, :, :]

    seqlen_k = cu_seqlen_k[1:] - cu_seqlen_k[:-1]
    seqlen_q = cu_seqlen_q[1:] - cu_seqlen_q[:-1]
    seqlen_q = seqlen_q.view(-1, 1)
    seqlen_k = seqlen_k.view(-1, 1)
    seqlen_diff = seqlen_k - seqlen_q
    q_idx_mask = (torch.arange(0, max_seqlen_q,
                               device="npu").view(1, -1).repeat(num_batch, 1))
    k_idx_mask = (torch.arange(0, max_seqlen_k,
                               device="npu").view(1, -1).repeat(num_batch, 1))
    q_mask = q_idx_mask < seqlen_q
    k_mask = k_idx_mask < seqlen_k

    # calculate idx for causal mask of query [batch, max_seqlen_q]
    causal_mask_idx = (q_idx_mask + seqlen_diff)[q_mask]

    # generate causal mask [batch, max_seqlen_q, max_seqlen_k]
    tril_mask = torch.tril(torch.ones(max_seqlen_k, max_seqlen_k,
                                      device="npu"))
    tril_mask[tril_mask == 0] = float("-inf")
    tril_mask[tril_mask == 1] = 0
    causal_mask = tril_mask[causal_mask_idx]
    causal_mask_padding = torch.empty([num_batch, max_seqlen_q, max_seqlen_k],
                                      device="npu").fill_(float("-inf"))
    causal_mask_padding[q_mask] = causal_mask
    # to [batch, num_heads, max_seqlen_q, max_seqlen_k]
    causal_mask_padding = causal_mask_padding.unsqueeze(1)

    pad_q = torch.zeros(
        [num_batch, max_seqlen_q, num_query_heads, head_dim],
        device="npu",
        dtype=query.dtype,
    )
    pad_k = torch.zeros(
        [num_batch, max_seqlen_k, num_kv_heads, head_dim],
        device="npu",
        dtype=key.dtype,
    )
    pad_v = torch.zeros(
        [num_batch, max_seqlen_k, num_kv_heads, head_dim],
        device="npu",
        dtype=value.dtype,
    )
    pad_q[q_mask] = query
    pad_k[k_mask] = key[k_mask]
    pad_v[k_mask] = value[k_mask]

    if num_query_heads > num_kv_heads:
        # GQA/MQA: repeat each kv head to match the query head count.
        pad_k = pad_k.view(
            [num_batch, max_seqlen_k, num_kv_heads, 1, head_dim])
        pad_k = pad_k.repeat(1, 1, 1, num_query_heads // num_kv_heads, 1).view(
            [num_batch, max_seqlen_k, num_query_heads, head_dim])
        pad_v = pad_v.view(
            [num_batch, max_seqlen_k, num_kv_heads, 1, head_dim])
        pad_v = pad_v.repeat(1, 1, 1, num_query_heads // num_kv_heads, 1).view(
            [num_batch, max_seqlen_k, num_query_heads, head_dim])
    # permute to [b, h, n, k]
    pad_q = pad_q.permute(0, 2, 1, 3)
    pad_k = pad_k.permute(0, 2, 1, 3)
    pad_v = pad_v.permute(0, 2, 1, 3)
    attn_mask = torch.empty([num_batch, 1, 1, max_seqlen_k],
                            device="npu").fill_(float("-inf"))
    attn_mask[:, :, :, :max_seqlen_k].masked_fill_(k_mask[:, None, None, :], 0)
    # [b, h, f, t]
    attn_weights = torch.einsum("bhqd,bhkd->bhqk", pad_q, pad_k)
    attn_weights *= scale
    attn_mask = attn_mask.float()
    attn_weights = attn_weights + attn_mask
    if causal:
        attn_weights = attn_weights + causal_mask_padding

    attn_weights = torch.softmax(attn_weights, dim=-1)
    attn_output = torch.einsum("bhqk,bhkd->bhqd", attn_weights, pad_v.float())
    attn_output = attn_output.permute(0, 2, 1, 3)

    attn_output = (attn_output[q_mask].view([-1, num_query_heads,
                                             head_dim]).to(output.dtype))
    output.copy_(attn_output)
    return attn_output


def vanilla_chunked_prefill_mla(
        output: torch.Tensor,  # (num_tokens, num_heads, v_head_dim)
        query: torch.Tensor,  # (num_tokens, num_heads, nope_dim + rope_dim)
        kv_cache: Tuple[
            torch.Tensor],  # [nope, rope] (num_blocks, block_size, latent_kv)
        block_tables: torch.Tensor,  # (batch_size, max_num_blocks_per_seq)
        query_lens: torch.Tensor,  # (batch_size)
        context_lens: torch.Tensor,  # (batch_size)
        kv_b_proj: "ColumnParallelLinear",
        max_query_len: int,
        max_context_len: int,
        nope_dim: int,
        rope_dim: int,
        v_head_dim: int,
        scale: float,
        alibi_slopes: Optional[torch.Tensor],
        causal: bool = True) -> None:
    """Reference chunked-prefill for MLA (latent KV) attention on NPU.

    Reconstructs per-sequence K (nope+rope) and V from the latent cache via
    `kv_b_proj`, pads to dense form, applies padding/causal masks, and
    copies the result into `output`. Handles MTP by zero-padding extra
    query rows. `alibi_slopes` is currently unused.
    """
    batch_size = block_tables.size(0)
    assert len(kv_cache) > 1
    assert query_lens.size(0) == batch_size
    num_heads = query.size(1)
    nope_cache = kv_cache[0]
    rope_cache = kv_cache[1]
    block_size = nope_cache.size(1)
    latent_kv_dim = nope_cache.size(-1)
    max_num_blocks_per_seq = block_tables.size(1)
    batch_size = query_lens.size(0)
    nope_cache = nope_cache.squeeze()
    # select kv_c out as [batch_size, max_context_len, latent_kv + rope_dim] and get kv_c and k_pe
    # cached_kv_c: [batch_size, max_context_len, latent_kv]
    # cached_k_pe: [batch_size, max_context_len, rope_dim]
    cache_kv_c = nope_cache[block_tables].view(
        batch_size, max_num_blocks_per_seq * block_size,
        latent_kv_dim)[:, :max_context_len, :]
    cache_k_pe = rope_cache[block_tables].view(
        batch_size, max_num_blocks_per_seq * block_size,
        rope_dim)[:, :max_context_len, :]
    # get k_rope and v
    # k_nope: [batch_size, max_context_len, num_heads, nope_dim]
    # value: [batch_size, max_context_len, num_heads, v_head_dim]
    k_nope, value = kv_b_proj(cache_kv_c)[0].view(
        batch_size, max_context_len, num_heads,
        nope_dim + v_head_dim).split([nope_dim, v_head_dim], dim=-1)
    # key: [batch_size, max_context_len, num_hads, rope_dim + nope_dim]
    key = torch.cat(
        [k_nope, cache_k_pe.unsqueeze(2).expand(-1, -1, num_heads, -1)],
        dim=-1)

    context_lens = context_lens.view(-1, 1).to("npu")
    query_lens = query_lens.view(-1, 1).to("npu")
    seq_diff = context_lens - query_lens

    q_idx_mask = (torch.arange(0, max_query_len,
                               device="npu").view(1, -1).repeat(batch_size, 1))
    kv_c_idx_mask = (torch.arange(0, max_context_len,
                                  device="npu").view(1,
                                                     -1).repeat(batch_size, 1))
    kv_c_mask = kv_c_idx_mask < context_lens
    q_mask = q_idx_mask < query_lens

    # calculate idx for causal mask of query [batch, max_seqlen_q]
    causal_mask_idx = (q_idx_mask + seq_diff)[q_mask]

    # generate causal mask [batch, max_seqlen_q, max_seqlen_k]
    tril_mask = torch.tril(
        torch.ones(max_context_len, max_context_len, device="npu"))
    tril_mask[tril_mask == 0] = float("-inf")
    tril_mask[tril_mask == 1] = 0
    causal_mask = tril_mask[causal_mask_idx]
    causal_mask_padding = torch.empty(
        [batch_size, max_query_len, max_context_len],
        device="npu").fill_(float("-inf"))
    causal_mask_padding[q_mask] = causal_mask
    # to [batch, num_heads, max_seqlen_q, max_seqlen_k]
    causal_mask_padding = causal_mask_padding.unsqueeze(1)

    pad_q = torch.zeros(
        [batch_size, max_query_len, num_heads, rope_dim + nope_dim],
        device="npu",
        dtype=query.dtype,
    )
    pad_k = torch.zeros(
        [batch_size, max_context_len, num_heads, rope_dim + nope_dim],
        device="npu",
        dtype=key.dtype,
    )
    pad_v = torch.zeros(
        [batch_size, max_context_len, num_heads, v_head_dim],
        device="npu",
        dtype=value.dtype,
    )
    num_query = torch.sum(q_mask).item()
    num_add_query = num_query - query.size(0)
    # mtp will come in
    if num_add_query > 0:
        add_query_size = query.size()
        add_query_size = list(add_query_size)
        add_query_size[0] = num_add_query
        pad_tensor = torch.zeros(add_query_size,
                                 dtype=query.dtype,
                                 device=query.device)
        query = torch.cat([query, pad_tensor], dim=0)
    pad_q[q_mask] = query
    pad_k[kv_c_mask] = key[kv_c_mask]
    pad_v[kv_c_mask] = value[kv_c_mask]

    pad_q = pad_q.permute(0, 2, 1, 3)
    pad_k = pad_k.permute(0, 2, 1, 3)
    pad_v = pad_v.permute(0, 2, 1, 3)
    attn_mask = torch.empty([batch_size, 1, 1, max_context_len],
                            device="npu").fill_(float("-inf"))
    attn_mask[:, :, :, :max_context_len].masked_fill_(
        kv_c_mask[:, None, None, :], 0)
    # [b, h, f, t]
    attn_weights = torch.einsum("bhqd,bhkd->bhqk", pad_q, pad_k)
    attn_weights *= scale
    attn_mask = attn_mask.float()
    attn_weights = attn_weights + attn_mask
    if causal:
        attn_weights = attn_weights + causal_mask_padding

    attn_weights = torch.softmax(attn_weights, dim=-1)
    attn_output = torch.einsum("bhqk,bhkd->bhqd", attn_weights, pad_v.float())
    attn_output = attn_output.permute(0, 2, 1, 3)

    attn_output = (attn_output[q_mask].view([-1, num_heads,
                                             v_head_dim]).to(output.dtype))
    attn_output = attn_output.view_as(output)
    output.copy_(attn_output)
    return attn_output


def vanilla_decode_mla(
        query: torch.
        Tensor,  # [num_tokens, num_heads, latent_dim + rope_dim]
        key_cache: torch.
        Tensor,  # [num_blocks, block_size, num_kv_heads, latent_dim + rope_dim]
        num_kv_heads: int,
        num_heads: int,
        scale: float,
        block_table: torch.Tensor,  # [batch_size, max_block_size]
        context_lens: List[int],
        mla_vhead_size: int,
        rope_dim: int,
        output: torch.Tensor):
    """Reference MLA decode attention on NPU (one query token per sequence).

    `mla_vhead_size` is currently unused.
    """
    batch_size = block_table.size()[0]
    max_block_size = block_table.size()[1]
    reduce_dim = key_cache.size()[-1]
    block_size = key_cache.size()[1]
    latent_dim = reduce_dim - rope_dim
    kv_c_and_pe = key_cache[block_table].view(
        [batch_size, max_block_size * block_size, num_kv_heads, reduce_dim])
    max_context_len = max(context_lens)
    context_lens = torch.tensor(context_lens, device="npu").view(batch_size, 1)
    # [batch_size, max_context_len, num_kv_heads, latent_dim + rope_dim]
    # since the kv head is 1 in deepseek, we use expand here for perf
    # BUGFIX: the last expand argument was 1; expanding a non-singleton
    # dimension to 1 raises at runtime — -1 keeps reduce_dim unchanged.
    kv_c_and_pe = kv_c_and_pe[:, :max_context_len, :, :].expand(
        -1, -1, num_heads, -1)
    kv_c = kv_c_and_pe[..., :latent_dim]
    kv_idx_mask = (torch.arange(0, max_context_len,
                                device="npu").view(1,
                                                   -1).repeat(batch_size, 1))
    # [batch_size, max_context_len]
    kv_idx_mask = kv_idx_mask < context_lens
    query = query.unsqueeze(1)
    attn_weights = torch.einsum("bqhd,bkhd->bhqk", query, kv_c_and_pe)
    attn_weights *= scale
    # BUGFIX: original indexed a 2-D mask with 4 indices and *added* the
    # 0/1 float mask; out-of-range key positions must be set to -inf
    # before the softmax instead.
    attn_weights = attn_weights.masked_fill(~kv_idx_mask[:, None, None, :],
                                            float("-inf"))
    attn_weights = torch.softmax(attn_weights, dim=-1)
    attn_output = torch.einsum("bhqk,bkhd->bqhd", attn_weights,
                               kv_c.float()).view(-1, num_heads, latent_dim)
    output.copy_(attn_output)
    return output
def async_all_to_all(input_,
                     output_split_sizes,
                     input_split_sizes,
                     group,
                     event=None):
    """Launch an asynchronous all-to-all and return (input_, output, handle).

    When `output_split_sizes` is None an equal split is assumed; otherwise
    an unequal (all2all-v) output buffer is allocated. If `event` is given,
    the collective is issued on a lazily created side communication stream
    after waiting on the event.
    """
    if output_split_sizes is None:
        # Equal split (all2all)
        a2a_out = torch.empty_like(input_)
    else:
        # Unequal split (all2all-v)
        out_rows = sum(output_split_sizes)
        a2a_out = input_.new_empty(
            size=[out_rows] + list(input_.size()[1:]),
            dtype=input_.dtype,
            device=torch.npu.current_device(),
        )

    a2a_kwargs = dict(output_split_sizes=output_split_sizes,
                      input_split_sizes=input_split_sizes,
                      group=group,
                      async_op=True)
    if event:
        # multi stream wait event
        global COMM_STREAM
        if COMM_STREAM is None:
            COMM_STREAM = torch_npu.npu.Stream(
                device=torch.npu.current_device())
        with torch_npu.npu.stream(COMM_STREAM):
            event.wait()
            handle = dist.all_to_all_single(a2a_out, input_.contiguous(),
                                            **a2a_kwargs)
    else:
        handle = dist.all_to_all_single(a2a_out, input_.contiguous(),
                                        **a2a_kwargs)
    return input_, a2a_out, handle
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any, Callable, Optional

import torch
import torch_npu
from vllm.config import CompilationLevel, get_current_vllm_config
from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe.config import \
    FusedMoEParallelConfig  # isort: skip
from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE, UnquantizedFusedMoEMethod)

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.moe_comm_method import (AllGatherCommImpl,
                                                     AlltoAllCommImpl,
                                                     MC2CommImpl)
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
    setup_token_dispatchers
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p, vllm_version_is

# Keep a reference to the stock initializer so the patched version below can
# delegate to it before adding Ascend-specific state.
original_unquantized_fused_moe_init_func = UnquantizedFusedMoEMethod.__init__


def fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: str = "silu",
    apply_router_weight_on_input: bool = False,
    use_int8_w8a8: bool = False,
    use_int4_w4a8: bool = False,
    global_num_experts: Optional[int] = None,
    expert_map: Optional[torch.Tensor] = None,
    w1_scale: Optional[torch.Tensor] = None,
    w2_scale: Optional[torch.Tensor] = None,
    w1_scale_bias: torch.Tensor = None,
    w2_scale_bias: torch.Tensor = None,
    # For TorchAir graph
    is_torchair: bool = False,
    # For Cube/Vector parallel
    shared_experts: Optional[Any] = None,
    quantized_x_for_share: Optional[Any] = None,
    dynamic_scale_for_share: Optional[Any] = None,
    # For load balance
    log2phy: torch.Tensor = None,
    global_redundant_expert_num: int = 0,
) -> torch.Tensor:
    """Run the routed experts over ``hidden_states`` in place.

    Tokens are permuted by the forward-context MoE communication method,
    pushed through the grouped gate-up GEMM, (possibly quantized) SwiGLU
    and the grouped down GEMM, then unpermuted back into ``hidden_states``,
    which is also the return value.

    Args:
        hidden_states: (num_tokens, hidden_size) activations; overwritten
            with the expert outputs.
        w1 / w2: Stacked gate-up / down projection weights, one slice per
            local expert.
        topk_weights / topk_ids: Per-token routing weights and expert ids.
        use_int8_w8a8 / use_int4_w4a8: Quantized-path switches; either one
            requires ``w1_scale`` and ``w2_scale``.
        expert_map: Optional global->local expert id mapping.

    Returns:
        ``hidden_states``, mutated in place.

    Note:
        ``activation``, ``apply_router_weight_on_input``, ``is_torchair``
        and the ``*_for_share`` / ``log2phy`` arguments are accepted for
        signature compatibility; this implementation does not read them.
    """
    # Shape/layout sanity checks.
    assert hidden_states.shape[1] == w1.shape[1], (
        f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[1]}")
    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
    assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
    assert hidden_states.dtype in [
        torch.float32, torch.float16, torch.bfloat16
    ]
    if (use_int8_w8a8 or use_int4_w4a8):
        # Both quantized paths need per-channel weight scales (message
        # previously claimed INT8 only, although the guard covers INT4 too).
        assert w1_scale is not None and w2_scale is not None, \
            "INT8/INT4 quantization requires weight scales."

        w1_scale = w1_scale.to(torch.float32)
        down_scale = [w2_scale]
        down_output_dtype = w2_scale.dtype
    else:
        down_scale = None
        down_output_dtype = None

    moe_comm_method = get_forward_context().moe_comm_method
    assert moe_comm_method is not None, "Missing communication context"

    num_experts = w1.shape[0]

    permuted_hidden_states, expert_tokens, dynamic_scale, group_list_type = moe_comm_method.permute(
        hidden_states, topk_ids, topk_weights, expert_map, num_experts,
        use_int8_w8a8 or use_int4_w4a8)

    gate_up_output = torch_npu.npu_grouped_matmul(
        x=[permuted_hidden_states],
        weight=[w1],
        split_item=2,
        group_list_type=group_list_type,
        group_type=0,
        group_list=expert_tokens,
        # int8 path produces int32 accumulators, dequantized below.
        output_dtype=torch.int32 if use_int8_w8a8 else None,
    )[0]

    if (use_int8_w8a8 or use_int4_w4a8):
        # Fused dequant + SwiGLU + requant keeps the activation quantized.
        activated_output, activated_output_scale = torch_npu.npu_dequant_swiglu_quant(
            x=gate_up_output,
            weight_scale=w1_scale,
            activation_scale=dynamic_scale,
            bias=None,
            quant_scale=None,
            quant_offset=None,
            group_index=expert_tokens,
            activate_left=True,
            quant_mode=1,
        )
        activated_output_scale = [activated_output_scale]
    else:
        activated_output = torch_npu.npu_swiglu(gate_up_output)
        activated_output_scale = None

    down_output = torch_npu.npu_grouped_matmul(
        x=[activated_output],
        weight=[w2],
        scale=down_scale,
        per_token_scale=activated_output_scale,
        split_item=2,
        group_list_type=group_list_type,
        group_type=0,
        group_list=expert_tokens,
        output_dtype=down_output_dtype,
    )[0]

    # Scatter expert outputs back into hidden_states (in place).
    moe_comm_method.unpermute(down_output, hidden_states)

    return hidden_states


def fused_experts_moge(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    moe_parallel_config: FusedMoEParallelConfig,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    global_num_experts: int,
    expert_map: torch.Tensor = None,
    apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
    """Sort-based MoE path used for MoGE-style models and 310P devices.

    Args:
        hidden_states: Hidden states of shape (num_tokens, hidden_size).
        w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
        w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
        moe_parallel_config: Supplies ``ep_size`` for local expert/group math.
        topk_weights: Routing weights of shape (num_tokens, top_k).
        topk_ids: Selected expert IDs of shape (num_tokens, top_k).
        top_k: Number of experts to select.
        global_num_experts: Total expert count across all EP ranks.
        expert_map: Expert mapping of shape (num_experts,).
        apply_router_weight_on_input: Accepted for signature compatibility;
            not read here.

    Returns:
        hidden_states: Hidden states after routing, (num_tokens, hidden_size).
    """
    ep_size = moe_parallel_config.ep_size
    local_num_experts = global_num_experts // ep_size
    local_num_group = top_k // ep_size

    bsz, _ = hidden_states.shape
    # Stable-sort tokens by expert id so each expert's tokens are contiguous.
    flatten_topk_ids = topk_ids.view(-1)
    sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
    sorted_topk_ids = sorted_topk_ids.to(torch.int32)
    sorted_hidden_states = hidden_states.index_select(
        0, sorted_topk_ids // local_num_group)

    experts_id = torch.arange(0,
                              local_num_experts,
                              dtype=topk_ids.dtype,
                              device=topk_ids.device)
    # Histogram of tokens per local expert, turned into a cumulative
    # group_list for the grouped matmuls below.
    num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to(
        torch.float32).sum(0)
    topk_scales = topk_weights.view(-1).index_select(
        0, sorted_topk_ids).unsqueeze(-1)
    group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64)

    gate_up_out = torch_npu.npu_grouped_matmul(
        x=[sorted_hidden_states],
        weight=[w1],
        split_item=2,
        group_list_type=0,
        group_type=0,
        group_list=group_list,
    )[0]

    if is_310p():
        # 310P SwiGLU runs in fp32 and is cast back to fp16.
        gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
            torch.float16)
    else:
        gate_up_out = torch_npu.npu_swiglu(gate_up_out)
    gate_up_out *= topk_scales

    down_out_list = torch_npu.npu_grouped_matmul(
        x=[gate_up_out],
        weight=[w2],
        split_item=2,
        group_list_type=0,
        group_type=0,
        group_list=group_list,
    )[0]

    # Undo the sort, then sum each token's local_num_group expert outputs.
    unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
    unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids)
    final_hidden_states = unsorted_hidden_states.reshape(
        bsz, top_k // ep_size, -1).sum(1)

    return final_hidden_states


def unquantized_fused_moe_init_func(self, *args, **kwargs):
    """Patched ``UnquantizedFusedMoEMethod.__init__`` adding ``use_aclgraph``."""
    original_unquantized_fused_moe_init_func(self, *args, **kwargs)

    # NOTE: Currently, this self.use_aclgraph is only used in
    # UnquantizedFusedMoEMethod.forward_oot to decide whether to use in
    # ops/fused_moe.py:568 to circumvent torch.randint_like not supported issue.
    # Once torch.randint_like is supported or removed, this flag can be removed.
    vllm_config = get_current_vllm_config()
    ascend_config = get_ascend_config()
    if ascend_config.torchair_graph_config.enabled:
        self.use_aclgraph = False
    else:
        self.use_aclgraph = (vllm_config.compilation_config.level
                             == CompilationLevel.PIECEWISE
                             and not vllm_config.model_config.enforce_eager)


def forward_oot_v01011(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        use_grouped_topk: bool,
        top_k: int,
        router_logits: torch.Tensor,
        renormalize: bool,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Out-of-tree forward for vLLM 0.10.1.x (no routed_scaling_factor arg)."""
    topk_weights, topk_ids, _ = select_experts(
        hidden_states=x,
        router_logits=router_logits,
        top_k=top_k,
        use_grouped_topk=use_grouped_topk,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=1.0,
        e_score_correction_bias=e_score_correction_bias,
        global_num_experts=global_num_experts)

    # MoGE models select fewer experts than top_k per rank; 310P always
    # takes the sort-based path.
    if topk_ids.shape[1] < top_k or is_310p():
        assert global_num_experts is not None
        return fused_experts_moge(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            moe_parallel_config=self.moe.moe_parallel_config,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            top_k=top_k,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            apply_router_weight_on_input=apply_router_weight_on_input)

    return fused_experts(
        hidden_states=x,
        w1=layer.w13_weight,
        w2=layer.w2_weight,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
    )
def forward_oot(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        use_grouped_topk: bool,
        top_k: int,
        router_logits: torch.Tensor,
        renormalize: bool,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        e_score_correction_bias: Optional[torch.Tensor] = None,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
        enable_eplb: bool = False,
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None) -> torch.Tensor:
    """Out-of-tree FusedMoE forward (vLLM >= 0.10.2 signature).

    Selects experts, then routes through either the sort-based MoGE path
    (fewer-than-top_k selections or 310P) or the standard fused path.
    """
    topk_weights, topk_ids, _ = select_experts(
        hidden_states=x,
        router_logits=router_logits,
        top_k=top_k,
        use_grouped_topk=use_grouped_topk,
        renormalize=renormalize,
        topk_group=topk_group,
        num_expert_group=num_expert_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        e_score_correction_bias=e_score_correction_bias,
        global_num_experts=global_num_experts)

    use_moge_path = topk_ids.shape[1] < top_k or is_310p()
    if use_moge_path:
        assert global_num_experts is not None
        return fused_experts_moge(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            moe_parallel_config=self.moe.moe_parallel_config,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            top_k=top_k,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
            apply_router_weight_on_input=apply_router_weight_on_input)

    return fused_experts(
        hidden_states=x,
        w1=layer.w13_weight,
        w2=layer.w2_weight,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
    )


def process_weights_after_loading(self, layer):
    """Transpose expert weights for the grouped matmuls and, off 310P,
    cast them to the NZ fractal format."""
    super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer)

    w13_data = self._maybe_pad_weight(layer.w13_weight.data).transpose(
        1, 2).contiguous()
    layer.w13_weight = torch.nn.Parameter(w13_data, requires_grad=False)

    w2_data = self._maybe_pad_weight(layer.w2_weight.data).transpose(
        1, 2).contiguous()
    layer.w2_weight = torch.nn.Parameter(w2_data, requires_grad=False)

    if not is_310p():
        layer.w13_weight.data = torch_npu.npu_format_cast(
            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
        layer.w2_weight.data = torch_npu.npu_format_cast(
            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)


class AscendFusedMoE(FusedMoE):
    """FusedMoE subclass wiring in Ascend token dispatchers and comm methods."""

    def __init__(
        self,
        num_experts,
        top_k,
        hidden_size,
        intermediate_size,
        params_dtype=None,
        reduce_results=False,
        renormalize=True,
        use_grouped_topk=False,
        num_expert_group=None,
        topk_group=None,
        quant_config=None,
        tp_size=None,
        ep_size=None,
        dp_size=None,
        prefix="",
        custom_routing_function=None,
        scoring_func="softmax",
        routed_scaling_fator: float = 1.0,
        e_score_correction_bias=None,
        apply_router_weight_on_input=False,
        activation="silu",
        enable_eplb=False,
        num_redundant_experts=0,
        has_bias=False,
    ):
        # vLLM 0.10.1.x lacks the routed scaling factor parameter; splice it
        # into the positional argument list only for newer versions.
        leading = [
            num_experts, top_k, hidden_size, intermediate_size, params_dtype,
            reduce_results, renormalize, use_grouped_topk, num_expert_group,
            topk_group, quant_config, tp_size, ep_size, dp_size, prefix,
            custom_routing_function, scoring_func
        ]
        trailing = [
            e_score_correction_bias, apply_router_weight_on_input, activation,
            enable_eplb, num_redundant_experts, has_bias
        ]
        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            super().__init__(*leading, *trailing)
        else:
            super().__init__(*leading, routed_scaling_fator, *trailing)

        setup_token_dispatchers(self.moe_config.ep_size,
                                top_k=self.top_k,
                                num_experts=self.global_num_experts,
                                num_local_experts=self.local_num_experts)

        # Expose the parallel groups on the MoE config for the comm impls.
        self.moe_config.tp_group = get_tp_group()
        self.moe_config.dp_group = get_dp_group()
        self.moe_config.ep_group = get_ep_group()
        self.moe_config.mc2_group = get_mc2_group()

        # Instantiate each comm method under its lowercased class name so
        # forward_impl can fetch it by the name stored in the context.
        for impl_cls in (AllGatherCommImpl, AlltoAllCommImpl, MC2CommImpl):
            setattr(self, impl_cls.__name__.lower(),
                    impl_cls(moe_config=self.moe_config))  # type: ignore[abstract]

    def forward_impl(self, hidden_states: torch.Tensor,
                     router_logits: torch.Tensor):
        assert self.quant_method is not None

        forward_context = get_forward_context()
        moe_comm_method_name = forward_context.moe_comm_method_name

        # TODO: Can we refactor this logic to model_runner?
        # TODO: Adjusted logic to differentiate between A2 and A3, we check
        # ep_size here since mc2 only support ep_size >= 16 on A3 now
        if self.moe_config.ep_size < 16:
            moe_comm_method_name = "allgathercommimpl"

        forward_context.moe_comm_method = getattr(self, moe_comm_method_name)

        hidden_states, router_logits = forward_context.moe_comm_method.prepare(
            hidden_states=hidden_states, router_logits=router_logits)

        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
            x=hidden_states,
            router_logits=router_logits,
            top_k=self.top_k,
            renormalize=self.renormalize,
            use_grouped_topk=self.use_grouped_topk,
            global_num_experts=self.global_num_experts,
            expert_map=self.expert_map,
            topk_group=self.topk_group,
            num_expert_group=self.num_expert_group,
            custom_routing_function=self.custom_routing_function,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            activation=self.activation,
            apply_router_weight_on_input=self.apply_router_weight_on_input,
            enable_eplb=self.enable_eplb,
            expert_load_view=self.expert_load_view,
            logical_to_physical_map=self.logical_to_physical_map,
            logical_replica_count=self.logical_replica_count,
        )

        final_hidden_states = forward_context.moe_comm_method.finalize(
            hidden_states=final_hidden_states,
            reduce_results=self.reduce_results)

        return final_hidden_states


UnquantizedFusedMoEMethod.__init__ = unquantized_fused_moe_init_func
UnquantizedFusedMoEMethod.process_weights_after_loading = process_weights_after_loading

if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
    UnquantizedFusedMoEMethod.forward_oot = forward_oot_v01011
else:
    UnquantizedFusedMoEMethod.forward_oot = forward_oot
import json
import random
from typing import Dict, List

import torch


class ExpertLoadBalancer(object):
    """Maps logical experts to physical placements from a JSON expert map.

    The JSON file holds ``moe_layer_count`` layers, each listing per-device
    ``device_expert`` id lists; redundant copies of an expert may appear on
    several ranks.
    """

    def __init__(self, expert_map_path, global_expert_num):
        self.expert_map_path = expert_map_path
        self.global_expert_num = global_expert_num
        self.expert_map_tensor, self.layers_num, self.ranks_num = (
            self._expert_file_to_tensor())

    def _expert_file_to_tensor(self):
        # Parse the JSON map into an int32 tensor of shape
        # (layers, devices, experts_per_device).
        with open(self.expert_map_path, "r") as f:
            data = json.load(f)
        layers_num = data["moe_layer_count"]
        gpus_num = data["layer_list"][0]["device_count"]

        per_layer_rows = []
        for layer in data["layer_list"]:
            per_layer_rows.append(
                [device["device_expert"] for device in layer["device_list"]])
        expert_map_tensor = torch.tensor(per_layer_rows, dtype=torch.int32)
        return expert_map_tensor, layers_num, gpus_num

    def generate_index_dicts(self, tensor_2d):
        # For each row (rank), map expert id -> its global physical slot,
        # numbering slots consecutively across rows.
        dict_list = []
        next_slot = 0
        for row in tensor_2d:
            row_len = row.size(0)
            mapping = {
                row[i].item(): next_slot + i
                for i in range(row_len)
            }
            dict_list.append(mapping)
            next_slot += row_len
        return dict_list

    def generate_expert_placement_map(self):
        # placement[layer, rank, expert_id] = local slot on that rank,
        # or -1 when the rank does not host the expert.
        expert_placement_map = torch.full(
            (self.layers_num, self.ranks_num, self.global_expert_num),
            -1,
            dtype=torch.int32,
        )
        for layer_id in range(self.layers_num):
            for gpu_id in range(self.ranks_num):
                e_ids = self.expert_map_tensor[layer_id, gpu_id]
                expert_placement_map[layer_id, gpu_id,
                                     e_ids] = torch.arange(len(e_ids),
                                                           dtype=torch.int32)
        return expert_placement_map

    def generate_log2phy_expert_map(self, layer_id):
        # Build logical->physical slot map per rank; experts not hosted
        # locally are assigned a random replica's slot.
        flat_ids = torch.flatten(self.expert_map_tensor[layer_id])
        rank_expert_to_global = self.generate_index_dicts(
            self.expert_map_tensor[layer_id])

        replicas: Dict[int, List[int]] = {}
        for slot, value in enumerate(flat_ids):
            replicas.setdefault(value.item(), []).append(slot)

        log2phy_map = torch.full((self.ranks_num, self.global_expert_num),
                                 -1,
                                 dtype=torch.int32)
        for rank in range(self.ranks_num):
            local_map = rank_expert_to_global[rank]
            for expert_id, slots in replicas.items():
                if expert_id in local_map:
                    log2phy_map[rank][expert_id] = local_map[expert_id]
                else:
                    log2phy_map[rank][expert_id] = random.choice(slots)
        return log2phy_map

    def get_rank_placement_map(self, layer_id, rank_id):
        # Returns (local expert count, per-expert local-slot tensor on NPU).
        expert_placement_map = self.generate_expert_placement_map()
        rank_expert_map = expert_placement_map[layer_id][rank_id].to(
            torch.npu.current_device())
        rank_local_expert_num = torch.sum(torch.ne(rank_expert_map, -1)).item()
        return rank_local_expert_num, rank_expert_map

    def get_rank_log2phy_map(self, layer_id, rank_id):
        return self.generate_log2phy_expert_map(layer_id)[rank_id]

    def get_global_redundant_expert_num(self):
        # Physical slots minus logical experts = redundant replica count.
        experts_per_rank = len(self.expert_map_tensor[0][0])
        return experts_per_rank * self.ranks_num - self.global_expert_num
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/kernels/test_moe.py

import os
from typing import Any, Callable, Optional

import torch
import torch.distributed as dist
import torch_npu
from torch import nn
from vllm.config import get_current_vllm_config
from vllm.distributed import (get_tensor_model_parallel_rank,
                              get_tensor_model_parallel_world_size,
                              tensor_model_parallel_all_reduce)
from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
                                             get_tp_group)
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.fused_moe.config import \
    FusedMoEConfig  # isort: skip
from vllm.model_executor.layers.fused_moe.config import \
    FusedMoEParallelConfig  # isort: skip
from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map)
from vllm.model_executor.layers.quantization.base_config import \
    QuantizationConfig

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import FusedMoEState
from vllm_ascend.distributed.communication_op import \
    data_parallel_reduce_scatter
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.sequence_parallel import MetadataForPadding
from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, dispose_tensor,
                               get_all_reduce_merge_state,
                               get_rm_router_logits_state, is_310p)


def unified_fused_experts_eager(hidden_states: torch.Tensor,
                                w1: torch.Tensor,
                                w2: torch.Tensor,
                                topk_weights: torch.Tensor,
                                topk_ids: torch.Tensor,
                                row_idx: torch.Tensor,
                                expert_map: Optional[torch.Tensor] = None,
                                log2phy: Optional[torch.Tensor] = None,
                                global_redundant_expert_num: int = 0,
                                w1_scale: Optional[torch.Tensor] = None,
                                w1_scale_bias: Optional[torch.Tensor] = None,
                                w2_scale: Optional[torch.Tensor] = None,
                                w2_scale_bias: Optional[torch.Tensor] = None,
                                shared_experts: Optional[torch.Tensor] = None,
                                shared_gate_up: Optional[Any] = None,
                                shared_dequant_scale: Optional[Any] = None,
                                mc2_mask: Optional[torch.Tensor] = None,
                                apply_router_weight_on_input: bool = False,
                                with_quant: bool = False):
    """Dispatch tokens, run the expert MLP, and combine the outputs.

    Thin orchestration wrapper: the forward-context token dispatcher moves
    tokens to their experts, ``unified_apply_mlp`` runs the grouped MLP,
    and the dispatcher gathers the results back.
    """
    dispatcher = get_forward_context().token_dispatcher

    dispatched = dispatcher.token_dispatch(
        hidden_states=hidden_states,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        row_idx=row_idx,
        expert_map=expert_map,
        log2phy=log2phy,
        global_redundant_expert_num=global_redundant_expert_num,
        shared_experts=shared_experts,
        shared_gate_up=shared_gate_up,
        shared_dequant_scale=shared_dequant_scale,
        mc2_mask=mc2_mask,
        apply_router_weight_on_input=apply_router_weight_on_input,
        with_quant=with_quant)

    expert_output = unified_apply_mlp(
        hidden_states=dispatched["hidden_states"],
        w1=w1,
        w1_scale=w1_scale,
        w2=w2,
        w2_scale=w2_scale,
        group_list=dispatched["group_list"],
        dynamic_scale=dispatched.get("dynamic_scale"),
        group_list_type=dispatched.get("group_list_type"),
        w1_scale_bias=w1_scale_bias,
        w2_scale_bias=w2_scale_bias,
        topk_scales=dispatched.get("topk_scales"),
        with_quant=with_quant)

    return dispatcher.token_combine(expert_output)


class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
    """Unquantized FusedMoE method backed by Ascend token dispatchers."""

    def __init__(self, moe: FusedMoEConfig = None):
        super().__init__(moe=moe)
        vllm_config = get_current_vllm_config()

        self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
        self.max_model_len = vllm_config.model_config.max_model_len
        get_ascend_config()

        # Resolve the HCCL communicator name for all-to-all; older backends
        # without the accessor leave it unset.
        try:
            device_group = get_mc2_group().device_group
            # TODO: Try local_rank = ep_group.rank_in_group
            local_rank = torch.distributed.get_rank(group=device_group)
            backend = device_group._get_backend(torch.device("npu"))
            self.moe_all_to_all_group_name = backend.get_hccl_comm_name(
                local_rank)
        except AttributeError:
            self.moe_all_to_all_group_name = None

    def process_weights_after_loading(self, layer):
        # Pad weights if needed and, off 310P, convert to NZ fractal format.
        super(UnquantizedFusedMoEMethod,
              self).process_weights_after_loading(layer)
        layer.w13_weight = torch.nn.Parameter(self._maybe_pad_weight(
            layer.w13_weight.data),
                                              requires_grad=False)
        layer.w2_weight = torch.nn.Parameter(self._maybe_pad_weight(
            layer.w2_weight.data),
                                             requires_grad=False)
        if not is_310p():
            layer.w13_weight.data = torch_npu.npu_format_cast(
                layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
            layer.w2_weight.data = torch_npu.npu_format_cast(
                layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = False,
        enable_force_load_balance: bool = False,
        shared_experts: Optional[Any] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Select experts for ``x`` and run the unquantized expert MLP."""
        topk_weights, topk_ids, row_idx = select_experts(
            hidden_states=x,
            router_logits=router_logits,
            top_k=top_k,
            use_grouped_topk=use_grouped_topk,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
            global_num_experts=global_num_experts)

        topk_weights = topk_weights.to(x.dtype)
        # this is a naive implementation for experts load balance so as
        # to avoid accumulating too much tokens on a single rank.
        # currently it is only activated when doing profile runs.
        if enable_force_load_balance and not self.use_aclgraph:
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

        return unified_fused_experts_eager(hidden_states=x,
                                           w1=layer.w13_weight,
                                           w2=layer.w2_weight,
                                           topk_weights=topk_weights,
                                           topk_ids=topk_ids,
                                           row_idx=row_idx,
                                           expert_map=expert_map,
                                           shared_experts=shared_experts,
                                           mc2_mask=kwargs.get(
                                               "mc2_mask", None),
                                           with_quant=False)
custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + ) + AscendFusedMoE.moe_counter += 1 + self.moe_instance_id = AscendFusedMoE.moe_counter + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + vllm_config = get_current_vllm_config() + + self.moe_parallel_config = FusedMoEParallelConfig.make( + tp_size_=(tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()), + dp_size_=(dp_size + if dp_size is not None else get_dp_group().world_size), + vllm_parallel_config=vllm_config.parallel_config) + + self.top_k = top_k + self.num_experts = num_experts + self.global_num_experts = num_experts + assert intermediate_size % self.tp_size == 0 + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + self.use_grouped_topk = use_grouped_topk + if self.use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.custom_routing_function = custom_routing_function + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias + self.expert_map = None + self.activation = activation + self.log2phy = None + self.global_redundant_expert_num = 0 + + is_deepseek_v3_r1 = self.global_num_experts == 256 + self.rm_router_logits = get_rm_router_logits_state( + self.moe_parallel_config.ep_size, self.dp_size, is_deepseek_v3_r1) + self.all_reduce_merge = get_all_reduce_merge_state( + self.moe_parallel_config.ep_size, is_deepseek_v3_r1) + + ascend_config = get_ascend_config() + expert_map_path = ascend_config.expert_map_path + if expert_map_path and os.path.exists(expert_map_path): + # moe expert load balance + expert_load_balancer = ExpertLoadBalancer(expert_map_path, + 
self.global_num_experts) + self.local_num_experts, self.expert_map = \ + expert_load_balancer.get_rank_placement_map( + self.moe_instance_id, + get_ep_group().rank_in_group) + self.log2phy = expert_load_balancer.get_rank_log2phy_map( + self.moe_instance_id, + get_ep_group().rank_in_group) + self.global_redundant_expert_num = \ + expert_load_balancer.get_global_redundant_expert_num() + else: + # Create a tensor of size num_experts filled with -1 + self.local_num_experts, self.expert_map = determine_expert_map( + self.ep_size, + get_ep_group().rank_in_group, self.global_num_experts) + + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + + if self.scoring_func != "softmax" and not self.use_grouped_topk: + raise ValueError("Only softmax scoring function is supported for " + "non-grouped topk.") + moe = FusedMoEConfig.make( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + # TODO (bnell): this needs to be fixed for quantized types. 
+ in_dtype=params_dtype, + quant_config=quant_config) + + self.moe_config = moe + + if quant_config is None: + self.quant_method = AscendUnquantizedFusedMoEMethod(moe) + else: + self.quant_method = quant_config.get_quant_method(self, prefix) + + assert self.quant_method is not None + + local_num_experts = torch.sum(self.expert_map != -1) \ + if self.expert_map is not None else num_experts + + moe_quant_params = { + "num_experts": local_num_experts, + "hidden_size": hidden_size, + "intermediate_size_per_partition": + self.intermediate_size_per_partition, + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + } + # need full intermediate size pre-sharding for WNA16 act order + if (self.quant_method.__class__.__name__ + in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")): + moe_quant_params["intermediate_size_full"] = intermediate_size + + self.ep_group = get_ep_group() + # NOTE: self.tp_group is not expert_tp_group + self.tp_group = get_tp_group().device_group + self.quant_method.create_weights(layer=self, **moe_quant_params) + self.token_dispatcher = None + + ep_size = (get_ep_group().world_size if + vllm_config.parallel_config.enable_expert_parallel else 1) + from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \ + setup_token_dispatchers + setup_token_dispatchers( + ep_size, + top_k=self.top_k, + num_experts=self.global_num_experts, + num_global_redundant_experts=self.global_redundant_expert_num, + num_local_experts=self.local_num_experts) + + def naive_multicast(self, x: torch.Tensor, + cu_tokens_across_dp_cpu: torch.Tensor): + assert (len(x.shape) == 2) + buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)), + device=x.device, + dtype=x.dtype) + start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[ + self.dp_rank - 1] + end = cu_tokens_across_dp_cpu[self.dp_rank] + buffer[start:end, :].copy_(x) + for idx in range(self.dp_size): + start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1] + end = 
cu_tokens_across_dp_cpu[idx]
+            get_dp_group().broadcast(buffer[start:end, :], idx)
+        return buffer
+
+    def forward(self,
+                hidden_states: torch.Tensor,
+                router_logits: torch.Tensor,
+                is_prefill: bool,
+                enable_force_load_balance: bool = False,
+                top_k: Optional[int] = None,
+                shared_experts: Optional[Any] = None,
+                gate=None,
+                replace_allreduce: bool = False,
+                _metadata_for_padding: Optional[MetadataForPadding] = None):
+        # MoE forward: optionally runs shared experts, pads/splits the tokens
+        # according to the active communication strategy (MC2 / AllGather /
+        # NaiveMulticast), dispatches to the quant method's fused-experts
+        # kernel, then undoes the padding/splitting on the way out.
+
+        assert self.quant_method is not None
+
+        # A caller-supplied top_k overrides the layer default.
+        if top_k:
+            real_top_k = top_k
+        else:
+            real_top_k = self.top_k
+
+        num_tokens, hidden_size = hidden_states.shape
+
+        forward_context = get_forward_context()
+        fused_moe_state = forward_context.fused_moe_state
+        mc2_mask = forward_context.mc2_mask
+        # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
+        quantized_x_for_share, dynamic_scale_for_share = None, None
+
+        if shared_experts:
+            # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
+            shared_hidden_states = shared_experts(hidden_states)
+
+        mc2_mask = forward_context.mc2_mask
+
+        # Sequence-parallel prefill: each TP rank keeps its own chunk of the
+        # mc2 mask and the later TP all-gather is skipped.
+        enable_sp = _metadata_for_padding is not None and _metadata_for_padding.not_dummy_and_is_prefill
+        tp_size = get_tensor_model_parallel_world_size()
+        if enable_sp:
+            tp_rank = get_tensor_model_parallel_rank()
+            mc2_mask_sp = _metadata_for_padding.mc2_mask if _metadata_for_padding is not None else forward_context.mc2_mask
+            chunk_mc2_mask = torch.tensor_split(mc2_mask_sp, tp_size, dim=0)
+            mc2_mask = chunk_mc2_mask[tp_rank]
+            replace_allreduce = True
+
+        # Non-all-gather states (e.g. MC2): pad the token dim up to a fixed
+        # size, then shard tokens across TP ranks.
+        if (fused_moe_state not in [
+                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
+                FusedMoEState.NaiveMulticast
+        ] and not replace_allreduce):
+            if fused_moe_state in {FusedMoEState.MC2}:
+                padding_size = forward_context.padded_num_tokens
+            else:
+                # TODO: Determine if we can remove the padding
+                padding_size = tp_size
+            if num_tokens < padding_size and not self.enable_shared_expert_dp:
+                hidden_states = nn.functional.pad(
+                    hidden_states, (0, 0, 0, padding_size - num_tokens))
+                router_logits = nn.functional.pad(
+                    router_logits, (0, 0, 0, padding_size - num_tokens))
+            if tp_size > 1:
+                tp_rank = get_tensor_model_parallel_rank()
+                if not self.enable_shared_expert_dp:
+                    chunk_hidden_states = torch.tensor_split(hidden_states,
+                                                             tp_size,
+                                                             dim=0)
+                    chunk_router_logits = torch.tensor_split(router_logits,
+                                                             tp_size,
+                                                             dim=0)
+                    hidden_states = chunk_hidden_states[tp_rank]
+                    router_logits = chunk_router_logits[tp_rank]
+
+                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
+                mc2_mask = chunk_mc2_mask[tp_rank]
+
+        if self.dp_size > 1:
+            if fused_moe_state == FusedMoEState.AllGather:
+                # NOTE: When in torchair graph, it has been padded in model_runner_v1
+                max_tokens_across_dp = forward_context.max_tokens_across_dp
+                if num_tokens < max_tokens_across_dp:
+                    hidden_states = nn.functional.pad(
+                        hidden_states,
+                        (0, 0, 0, max_tokens_across_dp - num_tokens))
+                    if not self.rm_router_logits:
+                        router_logits = nn.functional.pad(
+                            router_logits,
+                            (0, 0, 0, max_tokens_across_dp - num_tokens))
+                hidden_states = get_dp_group().all_gather(hidden_states, 0)
+                # rm_router_logits: recompute logits from the gathered hidden
+                # states instead of gathering the logits themselves.
+                if self.rm_router_logits:
+                    router_logits, _ = gate(hidden_states)
+                else:
+                    router_logits = get_dp_group().all_gather(router_logits, 0)
+
+            elif fused_moe_state == FusedMoEState.NaiveMulticast:
+                cu_tokens_across_dp_cpu = get_forward_context(
+                ).dp_metadata.cu_tokens_across_dp_cpu
+                hidden_states = self.naive_multicast(hidden_states,
+                                                     cu_tokens_across_dp_cpu)
+                if self.rm_router_logits:
+                    router_logits, _ = gate(hidden_states)
+                else:
+                    router_logits = self.naive_multicast(
+                        router_logits, cu_tokens_across_dp_cpu)
+
+        # Matrix multiply.
+        e_hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=real_top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.expert_map,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
+            is_prefill=is_prefill,
+            enable_force_load_balance=enable_force_load_balance,
+            log2phy=self.log2phy,
+            global_redundant_expert_num=self.global_redundant_expert_num,
+            shared_experts=None,
+            mc2_mask=mc2_mask,
+            token_dispatcher=self.token_dispatcher,
+            quantized_x_for_share=quantized_x_for_share,
+            dynamic_scale_for_share=dynamic_scale_for_share,
+        )
+
+        # Some quant methods return (router_out, shared_out) as a tuple.
+        if shared_experts:
+            if isinstance(e_hidden_states, tuple):
+                e_hidden_states, shared_hidden_states = e_hidden_states
+
+        # Reverse of the pre-apply transforms: gather TP chunks back and trim
+        # any padding that was added above.
+        if (fused_moe_state not in [
+                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
+                FusedMoEState.NaiveMulticast
+        ] and not replace_allreduce and not self.enable_shared_expert_dp):
+            if tp_size > 1:
+                dist.all_gather(list(chunk_hidden_states), e_hidden_states,
+                                self.tp_group)
+                final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
+                dispose_tensor(e_hidden_states)
+            else:
+                final_hidden_states = e_hidden_states
+            if num_tokens < padding_size:
+                final_hidden_states = final_hidden_states[:num_tokens]
+        elif self.dp_size > 1 and not self.enable_shared_expert_dp:
+            if fused_moe_state == FusedMoEState.NaiveMulticast:
+                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
+                    self.dp_rank - 1]
+                end = cu_tokens_across_dp_cpu[self.dp_rank]
+                final_hidden_states = get_dp_group().all_reduce(
+                    e_hidden_states)
+                final_hidden_states = final_hidden_states[start:end, :]
+                dispose_tensor(e_hidden_states)
+            elif fused_moe_state == FusedMoEState.AllGather:
+                final_hidden_states = data_parallel_reduce_scatter(
+                    e_hidden_states, dim=0)
+                final_hidden_states = final_hidden_states[:num_tokens]
+                dispose_tensor(e_hidden_states)
+            else:
+                final_hidden_states = e_hidden_states
+        else:
+            final_hidden_states = e_hidden_states
+
+        if tp_size > 1 and not self.all_reduce_merge and fused_moe_state in [
+                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
+                FusedMoEState.NaiveMulticast
+        ]:
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        if shared_experts:
+            return final_hidden_states, shared_hidden_states
+        else:
+            return final_hidden_states
+
+    # ----------------------------------------- TBO-related --------------------------------------------
+
+    def _forward_ms_fused_moe_comp(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        is_prefill: bool,
+        real_top_k,
+        enable_force_load_balance: bool = False,
+    ):
+        # Compute-only MoE step used by the two-batch-overlap (TBO) path:
+        # runs the fused-experts kernel with no padding/communication wrappers.
+        hidden_states = self.quant_method.apply(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            top_k=real_top_k,
+            renormalize=self.renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            global_num_experts=self.global_num_experts,
+            expert_map=self.expert_map,
+            topk_group=self.topk_group,
+            num_expert_group=self.num_expert_group,
+            custom_routing_function=self.custom_routing_function,
+            scoring_func=self.scoring_func,
+            e_score_correction_bias=self.e_score_correction_bias,
+            is_prefill=is_prefill,
+            enable_force_load_balance=enable_force_load_balance,
+        )
+
+        return hidden_states
diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py
new file mode 100644
index 0000000..4f0b550
--- /dev/null
+++ b/vllm_ascend/ops/layernorm.py
@@ -0,0 +1,85 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from typing import Optional, Tuple, Union
+
+import torch
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+class AddRMSNormW8A8Quant(RMSNorm):
+    # Fuse AddRmsNorm and W8A8 quantization ops together
+
+    def __init__(
+        self,
+        hidden_size: int,
+        layer: torch.nn.Module,
+        eps: float = 1e-6,
+        var_hidden_size: Optional[int] = None,
+        has_weight: bool = True,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
+        # Downstream linear layer; its aclnn_input_scale / aclnn_input_offset
+        # quantization parameters are consumed by the fused kernel below.
+        self.layer = layer
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        # With a residual: single fused NPU kernel doing residual-add +
+        # RMSNorm + input quantization, returning (normed_x, new_residual).
+        # Without a residual: plain (unfused, unquantized) RMSNorm.
+        import torch_npu
+
+        if residual is not None:
+            x, _, residual = torch_npu.npu_add_rms_norm_quant(
+                x,
+                residual,
+                self.weight,
+                self.layer.aclnn_input_scale,
+                self.layer.aclnn_input_offset,
+                epsilon=self.variance_epsilon)
+            return x, residual
+
+        # NOTE(review): the second output of npu_rms_norm is NOT a residual
+        # (it is discarded here); the name is misleading — consider `_`.
+        x, residual = torch_npu.npu_rms_norm(x, self.weight,
+                                             self.variance_epsilon)
+        return x
+
+
+class AscendRMSNorm(RMSNorm):
+
+    def forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        # Out-of-tree RMSNorm override mapping onto the NPU fused kernels.
+        import torch_npu
+
+        from vllm_ascend.utils import is_310p
+        if residual is not None:
+            if is_310p():
+                # 310P path: no fused add+rms_norm kernel, so add the
+                # residual in x's dtype, then normalize separately.
+                orig_dtype = residual.dtype
+                x = x + residual.to(x.dtype)
+                residual = x.to(orig_dtype)
+                x, _ = torch_npu.npu_rms_norm(x, self.weight,
+                                              self.variance_epsilon)
+            else:
+                x, _, residual = torch_npu.npu_add_rms_norm(
+                    x, residual, self.weight, self.variance_epsilon)
+            return x, residual
+
+        # NOTE(review): second output of npu_rms_norm is discarded; the
+        # `residual` name here is misleading.
+        x, residual = torch_npu.npu_rms_norm(x, self.weight,
+                                             self.variance_epsilon)
+        return x
diff --git a/vllm_ascend/ops/layers/__init__.py b/vllm_ascend/ops/layers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm_ascend/ops/layers/experts_selector.py b/vllm_ascend/ops/layers/experts_selector.py
new file mode 100644
index 0000000..eace164
--- /dev/null
+++ b/vllm_ascend/ops/layers/experts_selector.py
@@ -0,0 +1,283 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# +from typing import Callable, Optional + +import torch +import torch_npu + + +def return_row_idx(hidden_states, top_k): + num_tokens = hidden_states.shape[0] + row_idx_len = num_tokens * top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=hidden_states.device).view( + top_k, -1).permute(1, 0).contiguous()) + return row_idx + + +def select_experts(hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + routed_scaling_factor=1.0, + e_score_correction_bias: Optional[torch.Tensor] = None, + indices_type: Optional[torch.dtype] = None, + global_num_experts: int = -1): + """ + Fused experts with select experts. + + Args: + router_logits: router logits of shape (num_tokens, hidden_size). + hidden_states: Hidden states of shape (num_tokens, hidden_size). + top_k: number of top k experts. + use_grouped_topk: Whether to group experts before selecting top-k. + renormalize: Whether to renormalize the routing weights. + topk_group: Number of expert groups to select from. + num_expert_group: Number of experts in each group. + custom_routing_function: Custom routing function. + scoring_func: Scoring function to use. + e_score_correction_bias: Correction bias to apply to expert scores. + indices_type: dtype of indices + global_num_experts: Global number of experts. + + Returns: + topk_weights: router weights of shape (num_tokens, top_k). + topk_ids: selected expert IDs of shape (num_tokens, top_k). 
+ """ + + topk_weights, topk_ids, row_idx = _select_experts_with_fusion_ops( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + topk_group=topk_group, + renormalize=renormalize, + e_score_correction_bias=e_score_correction_bias, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + routed_scaling_factor=routed_scaling_factor, + global_num_experts=global_num_experts) + + if topk_weights is None: + topk_weights, topk_ids = _native_select_experts( + hidden_states=hidden_states, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + global_num_experts=global_num_experts, + ) + if row_idx is None: + row_idx = return_row_idx(hidden_states, top_k) + return topk_weights, topk_ids, row_idx + + +def _native_grouped_topk( + topk_weights: torch.Tensor, + num_expert_group: Optional[int], + topk_group: Optional[int], +): + topk_group = 0 if topk_group is None else topk_group + num_expert_group = 0 if num_expert_group is None else num_expert_group + + num_token = topk_weights.shape[0] + grouped_weights = topk_weights.view(num_token, num_expert_group, + -1).max(dim=-1).values + topk_group_indices = torch.topk(grouped_weights.to(torch.float32), + k=topk_group, + dim=-1, + sorted=False)[1] + topk_group_mask = torch.zeros_like(grouped_weights) + topk_group_mask.scatter_(1, topk_group_indices, 1) + topk_weight_mask = (topk_group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + topk_weights.shape[-1] // num_expert_group).reshape(num_token, -1)) + topk_weights = topk_weights.masked_fill(~topk_weight_mask.bool(), 0.0) + + return topk_weights + + +def _renormalize_topk_weights( + topk_weights: 
torch.Tensor,
+    renormalize: bool,
+):
+    # Normalize routing weights so each token's top-k weights sum to 1.
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights
+
+
+def _select_expert_use_group_topk(
+        topk_weights: torch.Tensor, topk_group: Optional[int],
+        renormalize: bool, top_k: int, num_expert_group: Optional[int],
+        e_score_correction_bias: Optional[torch.Tensor]):
+    # Grouped top-k selection: restrict candidates to the best expert
+    # groups, then take the per-token top-k experts within them.
+    assert topk_group is not None
+    assert num_expert_group is not None
+
+    if e_score_correction_bias is not None:
+        # Store original scores before applying correction bias. We use biased
+        # scores for expert selection but original scores for routing weights
+        original_weights = topk_weights
+        topk_weights = topk_weights + e_score_correction_bias.unsqueeze(0)
+
+    # TODO: Change to npu_group_topk when the latest CANN and NNAL is available
+    # >>> torch_npu._npu_group_topk(topk_weights, group_num=num_expert_group, k=topk_group)
+    topk_weights = _native_grouped_topk(topk_weights, num_expert_group,
+                                        topk_group)
+    # TODO bfloat16 is not supported in torch.topk with ge graph.
+    if e_score_correction_bias is not None:
+        topk_ids = torch.topk(topk_weights.to(torch.float32),
+                              k=top_k,
+                              dim=-1,
+                              sorted=False)[1]
+        # Use original unbiased scores for the routing weights
+        topk_weights = original_weights.gather(1, topk_ids)
+    else:
+        topk_weights, topk_ids = torch.topk(topk_weights.to(torch.float32),
+                                            k=top_k,
+                                            dim=-1,
+                                            sorted=False)
+    # int32 ids are required by the downstream npu_moe_init_routing op.
+    topk_ids = topk_ids.to(torch.int32)
+    topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
+    return topk_weights, topk_ids
+
+
+def _select_experts_with_fusion_ops(
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        use_grouped_topk: bool,
+        renormalize: bool,
+        e_score_correction_bias: Optional[torch.Tensor],
+        topk_group: Optional[int],
+        num_expert_group: Optional[int],
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        routed_scaling_factor=1.0,
+        global_num_experts: int = -1):
+    # Try the fused NPU gating kernels; returns (None, None, None) when no
+    # fused path matches, signalling the caller to use the native fallback.
+
+    topk_weights, topk_ids, row_idx = None, None, None
+    # NOTE: now npu_moe_gating_top_k can only support 'group_count=256' pattern
+    is_deepseek_v3_r1 = global_num_experts == 256
+    if is_deepseek_v3_r1:
+        topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+            router_logits,
+            k=top_k,  # topk currently 8
+            bias=e_score_correction_bias,
+            k_group=topk_group,  # fix: 4
+            group_count=num_expert_group,  # fix 8
+            group_select_mode=
+            1,  # 0: the maximum in the group; 1: topk2.sum(fix)
+            renorm=0,  # 0: softmax->topk(fix); 1: topk->softmax
+            norm_type=1,  # 0: softmax; 1: sigmoid(fix)
+            # out_flag=False, # todo new api; should the third output be output
+            # y2_flag=False, # old api; should the third output be output
+            # NOTE(review): the routed_scaling_factor parameter of this
+            # function is NOT forwarded here (hardcoded 1) — confirm the
+            # caller applies the scaling elsewhere.
+            routed_scaling_factor=1,
+            eps=float(1e-20))
+        row_idx = return_row_idx(hidden_states, top_k)
+    # NOTE(review): deliberately `if`, not `elif`? If global_num_experts is
+    # 256 AND the config is non-grouped softmax routing, this branch
+    # overwrites the deepseek result above — confirm that is intended.
+    if not use_grouped_topk and custom_routing_function is None and scoring_func == "softmax":
+        topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax(
+            x=router_logits, finished=None, k=top_k)
+        topk_ids = 
topk_ids.to(torch.int32)
+        topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
+
+    return topk_weights, topk_ids, row_idx
+
+
+def _native_select_experts(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    use_grouped_topk: bool,
+    renormalize: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+    custom_routing_function: Optional[Callable] = None,
+    scoring_func: str = "softmax",
+    e_score_correction_bias: Optional[torch.Tensor] = None,
+    global_num_experts: Optional[int] = None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Select top-k experts based on router logits (pure-PyTorch fallback).
+
+    Args:
+        hidden_states: Hidden states of shape (num_tokens, hidden_size).
+        router_logits: Router logits of shape (num_tokens, num_experts).
+        top_k: Number of experts to select.
+        use_grouped_topk: Whether to group experts before selecting top-k.
+        renormalize: Whether to renormalize the routing weights.
+        topk_group: Number of expert groups to select from.
+        num_expert_group: Number of experts in each group.
+        custom_routing_function: Custom routing function.
+        scoring_func: Scoring function to use.
+        e_score_correction_bias: Correction bias to apply to expert scores.
+        global_num_experts: Global number of experts (forwarded to the
+            custom routing function only).
+
+    Returns:
+        topk_weights: Routing weights of shape (num_tokens, top_k).
+        topk_ids: Selected expert IDs of shape (num_tokens, top_k).
+
+    Raises:
+        ValueError: If an unsupported scoring function is provided.
+    """
+
+    if scoring_func == "softmax":
+        topk_weights = router_logits.softmax(dim=-1)
+    elif scoring_func == "sigmoid":
+        topk_weights = router_logits.sigmoid()
+    else:
+        raise ValueError(f"Unsupported scoring function: {scoring_func}")
+
+    if use_grouped_topk:
+        return _select_expert_use_group_topk(
+            topk_weights=topk_weights,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            e_score_correction_bias=e_score_correction_bias)
+
+    if custom_routing_function is not None:
+        topk_weights, topk_ids = custom_routing_function(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            global_num_experts=global_num_experts)
+        # Required by npu_moe_init_routing
+        topk_ids = topk_ids.to(torch.int32)
+        return topk_weights, topk_ids
+
+    topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1)
+    topk_weights = topk_weights.to(hidden_states.dtype)
+
+    # Required by npu_moe_init_routing
+    topk_ids = topk_ids.to(torch.int32)
+    topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
+
+    return topk_weights, topk_ids
diff --git a/vllm_ascend/ops/layers/moe_mlp.py b/vllm_ascend/ops/layers/moe_mlp.py
new file mode 100644
index 0000000..c73e8ea
--- /dev/null
+++ b/vllm_ascend/ops/layers/moe_mlp.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+
+from typing import Optional
+
+import torch
+import torch_npu
+from vllm.forward_context import get_forward_context
+
+from vllm_ascend.ascend_forward_context import FusedMoEState
+from vllm_ascend.utils import dispose_tensor, is_310p
+
+
+def quant_apply_mlp(hidden_states: torch.Tensor,
+                    w1: torch.Tensor,
+                    w1_scale: torch.Tensor,
+                    w2: torch.Tensor,
+                    w2_scale: torch.Tensor,
+                    group_list: torch.Tensor,
+                    dynamic_scale: torch.Tensor = None,
+                    group_list_type: int = 1,
+                    w1_scale_bias: torch.Tensor = None,
+                    w2_scale_bias: torch.Tensor = None) -> torch.Tensor:
+    # Quantized grouped expert MLP: gate_up grouped-matmul -> swiglu ->
+    # down grouped-matmul, using per-token dynamic quantization.
+    # `dynamic_scale` lets the caller pass pre-quantized activations;
+    # otherwise the activations are quantized here.
+    if dynamic_scale is None:
+        unquantized_hidden_states = hidden_states
+        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
+            hidden_states)
+        # Dispose the original unquantized hidden states
+        # to save npu memory because they're no longer used.
+        dispose_tensor(unquantized_hidden_states)
+    else:
+        pertoken_scale = dynamic_scale
+
+    bias1, bias2 = None, None
+    _output_dtype = w2_scale.dtype
+
+    is_mc2 = get_forward_context().fused_moe_state == FusedMoEState.MC2
+    if w1_scale_bias is None and is_mc2:
+        # MC2 path: keep gmm1 output in int32 and fuse
+        # dequant + swiglu + requant into a single kernel.
+        w1_scale = w1_scale.to(torch.float32)
+
+        # gmm1: gate_up_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w1],
+            split_item=3,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=torch.int32)[0]
+
+        # act_fn: swiglu
+        hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
+            x=hidden_states,
+            weight_scale=w1_scale,
+            activation_scale=pertoken_scale,
+            bias=None,
+            quant_scale=None,
+            quant_offset=None,
+            group_index=group_list,
+            activate_left=True,
+            quant_mode=1,
+        )
+
+        # gmm2: down_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w2],
+            scale=[w2_scale],
+            per_token_scale=[swiglu_out_scale],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=w2_scale.dtype)[0]
+    else:
+        if w1_scale_bias is not None:
+            # Scale-bias (w4a8-style) path; convert a cumulative group_list
+            # (type 0) into per-group counts (type 1) for the kernels below.
+            if group_list_type == 0:
+                group_list = torch.cat(
+                    [group_list[:1],
+                     torch.diff(group_list, dim=0)])
+                group_list_type = 1
+            bias1 = [w1_scale_bias]
+            bias2 = [w2_scale_bias]
+            # TODO w4a8 scene: dynamic acquisition of dtype in the future
+            _output_dtype = torch.bfloat16
+
+        # gmm1: gate_up_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w1],
+            scale=[w1_scale],
+            bias=bias1,
+            per_token_scale=[pertoken_scale],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=_output_dtype)[0]
+
+        # act_fn: swiglu
+        hidden_states = torch_npu.npu_swiglu(hidden_states)
+        hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
+            hidden_states)
+
+        # gmm2: down_proj
+        hidden_states = torch_npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[w2],
+            scale=[w2_scale],
+            bias=bias2,
+            per_token_scale=[swiglu_out_scale],
+            split_item=2,
+            group_list_type=group_list_type,
+            group_type=0,
+            group_list=group_list,
+            output_dtype=_output_dtype)[0]
+    return hidden_states
+
+
+def unquant_apply_mlp(
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        group_list: torch.Tensor,
+        group_list_type: int = 1,
+        topk_scales: Optional[torch.Tensor] = None) -> torch.Tensor:
+    # Unquantized grouped expert MLP: gate_up -> swiglu -> down.
+    w1 = w1.transpose(1, 2)
+    gate_up_out = torch_npu.npu_grouped_matmul(
+        x=[hidden_states],
+        weight=[w1],
+        split_item=2,
+        group_list_type=group_list_type,
+        group_type=0,
+        group_list=group_list,
+    )[0]
+    if is_310p():
+        # 310P swiglu kernel requires fp32 input; cast back to fp16 after.
+        gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to(
+            torch.float16)
+    else:
+        gate_up_out = torch_npu.npu_swiglu(gate_up_out)
+
+    if topk_scales is not None:
+        gate_up_out *= topk_scales
+
+    w2 = w2.transpose(1, 2)
+    hidden_states = torch_npu.npu_grouped_matmul(
+        x=[gate_up_out],
+        weight=[w2],
+        split_item=2,
+        group_list_type=group_list_type,
+        group_type=0,
+        group_list=group_list,
+    )[0]
+    return hidden_states
+
+
+def unified_apply_mlp(hidden_states: torch.Tensor,
+                      w1: torch.Tensor,
+                      w1_scale: torch.Tensor,
+                      w2: torch.Tensor,
+                      w2_scale: torch.Tensor,
+                      group_list: torch.Tensor,
+                      dynamic_scale: torch.Tensor = None,
+                      group_list_type: int = 1,
+                      w1_scale_bias: torch.Tensor = None,
+                      w2_scale_bias: torch.Tensor = None,
+                      topk_scales: Optional[torch.Tensor] = None,
+                      with_quant: bool = False) -> torch.Tensor:
+    # Single entry point dispatching to the quantized or unquantized
+    # grouped-MLP implementation above.
+    if with_quant:
+        return quant_apply_mlp(hidden_states=hidden_states,
+                               w1=w1,
+                               w1_scale=w1_scale,
+                               w2=w2,
+                               w2_scale=w2_scale,
+                               group_list=group_list,
+                               dynamic_scale=dynamic_scale,
+                               group_list_type=group_list_type,
+                               w1_scale_bias=w1_scale_bias,
+                               w2_scale_bias=w2_scale_bias)
+    else:
+        return unquant_apply_mlp(hidden_states=hidden_states,
+                                 w1=w1,
+                                 w2=w2,
+                                 group_list=group_list,
+                                 group_list_type=group_list_type,
+                                 topk_scales=topk_scales)
diff --git a/vllm_ascend/ops/linear.py b/vllm_ascend/ops/linear.py
new file mode 100644
index 0000000..e2f427e
--- /dev/null
+++ b/vllm_ascend/ops/linear.py
@@ -0,0 +1,309 @@
+"""
+Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+This file is a part of the vllm-ascend project.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" + +from typing import Optional, Union + +import torch +from torch.nn.parameter import Parameter +from vllm.distributed import (divide, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce) +from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED, + ColumnParallelLinear, + LinearBase, + MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import \ + QuantizationConfig +from vllm.model_executor.utils import set_weight_attrs + +from vllm_ascend.distributed.parallel_state import ( + get_mlp_tensor_model_parallel_rank, + get_mlp_tensor_model_parallel_world_size, get_mlp_tp_group) + + +class AscendMlpColumnParallelLinear(ColumnParallelLinear): + """Linear layer with column parallelism. + + Use the MLP tensor parallelism group in the MLP module, + and the original TP group in other modules. + """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + output_sizes: Optional[list[int]] = None, + prefix: str = "", + *, + return_bias: bool = True, + ): + # Divide the weight matrix along the last dimension. + if prefix.find("gate_up_proj") != -1: + self.tp_size = get_mlp_tensor_model_parallel_world_size() + self.tp_rank = get_mlp_tensor_model_parallel_rank() + self.enable_mlp_optimze = True + else: + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.enable_mlp_optimze = False + self.input_size_per_partition = input_size + self.output_size_per_partition = divide(output_size, self.tp_size) + self.output_partition_sizes = [self.output_size_per_partition] + # If QKV or MergedColumn, use output size of each partition. 
+ if hasattr(self, "output_sizes"): + self.output_partition_sizes = [ + divide(output_size, self.tp_size) + for output_size in self.output_sizes + ] + LinearBase.__init__(self, + input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias) + + self.gather_output = gather_output + + if output_sizes is None: + output_sizes = [output_size] + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) + if bias: + self.bias = Parameter( + torch.empty(self.output_size_per_partition, + dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + +class AscendMlpRowParallelLinear(RowParallelLinear): + """Linear layer with row parallelism. + Use the MLP tensor parallelism group in the MLP module, + and the original TP group in other modules. 
+ """ + + def __init__( + self, + input_size: int, + output_size: int, + bias: bool = True, + input_is_parallel: bool = True, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + ): + if prefix.find("down_proj") != -1: + self.tp_size = get_mlp_tensor_model_parallel_world_size() + self.tp_rank = get_mlp_tensor_model_parallel_rank() + self.enable_mlp_optimze = True + else: + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.enable_mlp_optimze = False + # Divide the weight matrix along the first dimension. + self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + + LinearBase.__init__(self, + input_size, + output_size, + skip_bias_add, + params_dtype, + quant_config, + prefix, + return_bias=return_bias) + + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + + assert self.quant_method is not None + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.input_size_per_partition, + output_partition_sizes=self.output_partition_sizes, + input_size=self.input_size, + output_size=self.output_size, + params_dtype=self.params_dtype, + weight_loader=( + self.weight_loader_v2 if self.quant_method.__class__.__name__ + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) + if not reduce_results and (bias and not skip_bias_add): + raise ValueError("When not reduce the results, adding bias to the " + "results can lead to incorrect results") + + if bias: + self.bias = Parameter( + torch.empty(self.output_size, dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + def forward( + self, + 
input_, + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + if self.enable_mlp_optimze: + tp_rank = get_mlp_tensor_model_parallel_rank() + if self.input_is_parallel: + input_parallel = input_ + else: + tp_rank = get_mlp_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + # Matrix multiply. + assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 + or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + output = get_mlp_tp_group().reduce_scatter(output_parallel, 0) + # output = output[:num_tokens,:] + # dispose_tensor(output_parallel) + else: + if self.input_is_parallel: + input_parallel = input_ + else: + tp_rank = get_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + + # Matrix multiply. + assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + bias_ = None if (self.tp_rank > 0 + or self.skip_bias_add) else self.bias + output_parallel = self.quant_method.apply(self, + input_parallel, + bias=bias_) + if self.reduce_results and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output_parallel) + else: + output = output_parallel + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + +class AscendMlpMergedColumnParallelLinear(MergedColumnParallelLinear): + """Packed linear layers with column parallelism. + + Similar to ColumnParallelLinear, but the weight matrix is concatenated + along the output dimension. 
When the weight matrix is loaded, the + different partitions are sharded separately. + + Use the MLP tensor parallelism group in the MLP module, + and the original TP group in other modules. + """ + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + gather_output: bool = False, + skip_bias_add: bool = False, + params_dtype: Optional[torch.dtype] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + *, + return_bias: bool = True, + ): + self.output_sizes = output_sizes + if prefix.find("gate_up_proj") != -1: + self.tp_size = get_mlp_tensor_model_parallel_world_size() + self.tp_rank = get_mlp_tensor_model_parallel_rank() + self.enable_mlp_optimze = True + else: + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.enable_mlp_optimze = False + assert all(output_size % self.tp_size == 0 + for output_size in output_sizes) + AscendMlpColumnParallelLinear.__init__(self, + input_size=input_size, + output_size=sum(output_sizes), + bias=bias, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + params_dtype=params_dtype, + quant_config=quant_config, + prefix=prefix, + return_bias=return_bias) + + def forward( + self, + input_, + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + bias = self.bias if not self.skip_bias_add else None + # self.global_batch_size = vllm_config.scheduler_config.max_num_seqs + # Matrix multiply. + assert self.quant_method is not None + if self.enable_mlp_optimze: + input2_ = get_mlp_tp_group().all_gather(input_, 0) + output = self.quant_method.apply(self, input2_, bias) + else: + output_parallel = self.quant_method.apply(self, input_, bias) + if self.gather_output: + # All-gather across the partitions. 
+                output = tensor_model_parallel_all_gather(output_parallel)
+            else:
+                output = output_parallel
+
+        output_bias = self.bias if self.skip_bias_add else None
+        if not self.return_bias:
+            return output
+        return output, output_bias
diff --git a/vllm_ascend/ops/moe_dispatcher/__init__.py b/vllm_ascend/ops/moe_dispatcher/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py
new file mode 100644
index 0000000..855faad
--- /dev/null
+++ b/vllm_ascend/ops/moe_dispatcher/token_dispatcher.py
@@ -0,0 +1,809 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional + +import torch +import torch_npu +from vllm.distributed.parallel_state import get_ep_group + +from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.distributed.tensor_parallel import \ + gather_from_sequence_parallel_region +from vllm_ascend.ops.comm_utils import async_all_to_all +from vllm_ascend.utils import AscendSocVersion, get_ascend_soc_version + +_Dispatchers: Dict[str, Any] = {} + + +def _register_token_dispatcher(dispatcher: Any): + _Dispatchers[dispatcher.__class__.__name__] = dispatcher + + +def get_token_dispatcher(name: str): + return _Dispatchers.get(name) + + +def setup_token_dispatchers(ep_size: int, **kwargs): + existing_dispatchers = set(_Dispatchers.keys()) + + if ep_size == 1 and "TokenDispatcherWithAllGather" not in existing_dispatchers: + _register_token_dispatcher(TokenDispatcherWithAllGather(**kwargs)) + elif ep_size < 16 and "TokenDispatcherWithAll2AllV" not in existing_dispatchers: + _register_token_dispatcher(TokenDispatcherWithAll2AllV(**kwargs)) + elif ep_size >= 16: + if "TokenDispatcherWithAll2AllV" not in existing_dispatchers: + _register_token_dispatcher(TokenDispatcherWithAll2AllV(**kwargs)) + if "TokenDispatcherWithMC2" not in existing_dispatchers: + _register_token_dispatcher(TokenDispatcherWithMC2(**kwargs)) + + +class MoETokenDispatcher(ABC): + + def __init__(self, **kwargs) -> None: + """ + Initialize the MoE Token Dispatcher. 
+ """ + self.top_k = kwargs.get("top_k", 0) + self.num_experts = kwargs.get("num_experts", 0) + + @property + def ep_group(self): + """Get expert model parallel group.""" + return get_ep_group().device_group + + @property + def ep_rank(self): + return get_ep_group().rank_in_group + + @property + def ep_size(self): + return get_ep_group().world_size + + @abstractmethod + def token_dispatch(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + row_idx: torch.Tensor, + expert_map: Optional[torch.Tensor] = None, + log2phy: Optional[torch.Tensor] = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[torch.Tensor] = None, + shared_gate_up: Optional[torch.Tensor] = None, + shared_dequant_scale: Optional[torch.Tensor] = None, + mc2_mask: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + with_quant: bool = False): + raise NotImplementedError("Dispatch function not implemented.") + + @abstractmethod + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + raise NotImplementedError("Combine function not implemented.") + + +class TokenDispatcherWithMC2(MoETokenDispatcher): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + device_group = get_mc2_group().device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name(local_rank) + self.ep_rank_id = get_mc2_group().rank_in_group + self.ep_world_size = get_mc2_group().world_size + self.enable_dispatch_v2 = hasattr(torch_npu, + "npu_moe_distribute_dispatch_v2") + self.need_extra_args = ( + get_ascend_soc_version() == AscendSocVersion.A3) + + # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine + self.a3_need_extra_args = \ + get_ascend_soc_version() == AscendSocVersion.A3 + 
self.output = None + self.assist_info_for_combine = None + self.ep_recv_counts = None + self.shared_act = None + self.topk_ids = None + self.topk_weights = None + self.shared_experts = None + self.mc2_mask = None + self.with_quant = False + + def get_dispatch_mc2_kwargs( + self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + expert_map: torch.Tensor, + global_redundant_expert_num: int = 0, + ): + if self.with_quant: + quant_mode = 2 + if (expert_map is not None): + moe_expert_num = len(expert_map) + global_redundant_expert_num + else: + moe_expert_num = global_redundant_expert_num + else: + quant_mode = 0 + moe_expert_num = len(expert_map) + kwargs_mc2 = { + "x": hidden_states, + "expert_ids": topk_ids, + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + + stage1_kwargs = { + "scales": None, + "quant_mode": quant_mode, + "group_ep": self.moe_all_to_all_group_name, + "ep_world_size": self.ep_world_size, + "ep_rank_id": self.ep_rank_id, + } + if self.need_extra_args: + stage1_kwargs.update({ + "group_tp": self.moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if self.a3_need_extra_args and self.enable_dispatch_v2: + stage1_kwargs.update({ + "x_active_mask": self.mc2_mask, + }) + + kwargs_mc2.update(stage1_kwargs) + return kwargs_mc2 + + def token_dispatch(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + row_idx: torch.Tensor, + expert_map: Optional[torch.Tensor] = None, + log2phy: Optional[torch.Tensor] = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[torch.Tensor] = None, + shared_gate_up: Optional[torch.Tensor] = None, + shared_dequant_scale: Optional[torch.Tensor] = None, + mc2_mask: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + with_quant: bool = False): + self.with_quant = with_quant + self.expert_map = expert_map + self.topk_ids 
= topk_ids + self.topk_weights = topk_weights + self.shared_experts = shared_experts + self.mc2_mask = mc2_mask + + kwargs_mc2 = self.get_dispatch_mc2_kwargs(hidden_states, topk_weights, + topk_ids, expert_map, + global_redundant_expert_num) + self.output = torch_npu.npu_moe_distribute_dispatch_v2( + **kwargs_mc2 + ) if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_dispatch( + **kwargs_mc2) + # comm_stream.wait_stream(torch.npu.current_stream()) + expand_x, dynamic_scale, self.assist_info_for_combine, \ + expert_token_nums, self.ep_recv_counts = self.output[0:5] + + if self.with_quant: + if shared_experts is not None: + shared_act_out = shared_experts.act_fn( + (shared_gate_up, shared_dequant_scale)) + self.shared_act, self.swiglu_out_scale = \ + shared_act_out[0], shared_act_out[1] + + else: + if shared_experts is not None: + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + self.shared_act = shared_experts.act_fn(shared_gate_up) + group_list_type = 1 + return { + "group_list_type": group_list_type, + "hidden_states": expand_x, + "group_list": expert_token_nums, + "dynamic_scale": dynamic_scale, + } + + def get_combine_mc_kwargs(self, hidden_states: torch.Tensor): + assert self.expert_map is not None + assert self.topk_weights is not None + assert self.topk_ids is not None + assert self.output is not None + moe_expert_num = len(self.expert_map) + # moeCombine + kwargs_mc2 = { + "expand_x": hidden_states, + "expert_ids": self.topk_ids, + "expert_scales": self.topk_weights.to(torch.float32), + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + if self.with_quant: + tp_recv_counts = torch.empty(1, + dtype=torch.int32, + device=hidden_states.device) + else: + tp_recv_counts = self.output[5] + stage3_kwargs = { + "ep_send_counts": self.ep_recv_counts, + "group_ep": self.moe_all_to_all_group_name, + "ep_world_size": self.ep_world_size, + "ep_rank_id": self.ep_rank_id, + } + if 
self.enable_dispatch_v2: + stage3_kwargs.update({ + "assist_info_for_combine": + self.assist_info_for_combine, + }) + else: + stage3_kwargs.update({ + "expand_idx": self.assist_info_for_combine, + }) + if self.need_extra_args: + stage3_kwargs.update({ + "tp_send_counts": tp_recv_counts, + "group_tp": self.moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if self.a3_need_extra_args and self.enable_dispatch_v2: + stage3_kwargs.update({ + "x_active_mask": self.mc2_mask, + }) + kwargs_mc2.update(stage3_kwargs) + return kwargs_mc2 + + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + kwargs_mc2 = self.get_combine_mc_kwargs(hidden_states) + hidden_states = torch_npu.npu_moe_distribute_combine_v2( + **kwargs_mc2 + ) if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine( + **kwargs_mc2) + if self.shared_experts is None: + return hidden_states + else: + if self.with_quant: + shared_hidden_states, _ = self.shared_experts.down_proj( + (self.shared_act, self.swiglu_out_scale)) + else: + shared_hidden_states, _ = self.shared_experts.down_proj( + self.shared_act) + return hidden_states, shared_hidden_states + + +class TokenDispatcherWithAllGather(MoETokenDispatcher): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.apply_router_weight_on_input = False + self.max_num_tokens = kwargs.get("max_num_tokens") + self.num_experts_local = kwargs.get("num_local_experts", 0) + self.sorted_weights = None + self.expanded_row_idx = None + self.sorted_token_indices = None + self.original_shape = None + self.mask = None + self.expert_map = None + self.topk_weights = None + self.topk_ids = None + self.with_quant = False + + def token_dispatch(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + row_idx: torch.Tensor, + expert_map: Optional[torch.Tensor] = None, + log2phy: Optional[torch.Tensor] = None, + global_redundant_expert_num: int = 0, + 
shared_experts: Optional[torch.Tensor] = None, + shared_gate_up: Optional[torch.Tensor] = None, + shared_dequant_scale: Optional[torch.Tensor] = None, + mc2_mask: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + with_quant: bool = False): + self.with_quant = with_quant + self.original_shape = hidden_states.shape + + num_tokens = hidden_states.shape[:-1].numel() + dtype = hidden_states.dtype + device = hidden_states.device + self.expert_map = expert_map + self.topk_weights = topk_weights + self.topk_ids = topk_ids + self.apply_router_weight_on_input = apply_router_weight_on_input + if self.apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * \ + topk_weights.to(hidden_states.dtype) + + if expert_map is not None: + # Generate token indices and flatten + token_indices = (torch.arange( + num_tokens, device=device, + dtype=torch.int64).unsqueeze(1).expand(-1, + self.top_k).reshape(-1)) + + # Flatten token-to-expert mappings and map to local experts + weights_flat = topk_weights.view(-1) + experts_flat = topk_ids.view(-1) + local_experts_flat = expert_map[experts_flat] + + # Filter valid token-expert pairs + self.mask = local_experts_flat != -1 + filtered_weights = torch.where( + self.mask, weights_flat, + torch.zeros_like(weights_flat)).to(dtype) + filtered_experts = torch.where( + self.mask, local_experts_flat, + torch.full_like(local_experts_flat, + self.num_experts_local)).to(topk_ids.dtype) + + # Sort by local expert IDs + sort_indices = torch.argsort(filtered_experts.view(torch.float32)) + self.sorted_token_indices = token_indices[sort_indices] + self.sorted_weights = filtered_weights[sort_indices] + + # Compute token counts with minlength of num_experts + # This is equivalent to but faster than: + # >>> 
token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1] + token_counts = torch.zeros(self.num_experts_local + 1, + device=device, + dtype=torch.int64) + ones = torch.ones_like(filtered_experts, dtype=torch.int64) + token_counts.scatter_add_(0, filtered_experts.to(torch.int64), + ones) + token_counts = token_counts[:self.num_experts_local] + + # Rearrange hidden_states + sorted_hidden_states = hidden_states[self.sorted_token_indices] + if self.with_quant: + group_list_type = 1 + expert_tokens = token_counts + else: + expert_tokens = torch.cumsum(token_counts, + dim=0, + dtype=torch.int64) + group_list_type = 0 + else: + active_num = self.max_num_tokens if self.max_num_tokens is not None else num_tokens + sorted_hidden_states, self.expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=active_num) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, self.num_experts_local) + expert_tokens = expert_tokens.to(torch.int64) + group_list_type = 0 + return { + "group_list_type": group_list_type, + "hidden_states": sorted_hidden_states, + "group_list": expert_tokens, + } + + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + assert self.original_shape is not None + dtype = hidden_states.dtype + device = hidden_states.device + if self.expert_map is not None: + assert self.mask is not None + assert self.sorted_token_indices is not None + assert self.sorted_weights is not None + + weighted_down_out = hidden_states * \ + self.sorted_weights.unsqueeze(1) + + final_hidden_states = torch.zeros(*self.original_shape, + device=hidden_states.device, + dtype=hidden_states.dtype) + + # TODO: npu_grouped_matmul output random values at [num_valid_tokens:, ...] 
+ # This created multiple NaN and index_add_ will mix them up which harms accuracy + # remove this mask and filter after it being fixed + num_valid_tokens = self.mask.sum() + valid_token_mask = torch.arange( + 0, self.sorted_token_indices.shape[0], + device=device).unsqueeze(1) < num_valid_tokens + valid_output = torch.where( + valid_token_mask, weighted_down_out, + torch.zeros_like(weighted_down_out)).to(dtype) + final_hidden_states.index_add_(0, self.sorted_token_indices, + valid_output) + else: + if self.with_quant: + final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=self.topk_weights, + expanded_src_to_dst_row=self.expanded_row_idx, + export_for_source_row=self.topk_ids, + ) + if len(self.original_shape) == 3: + final_hidden_states = final_hidden_states.view( + self.original_shape) + else: + scales = torch.ones_like( + self.topk_weights + ) if self.apply_router_weight_on_input else self.topk_weights + # TODO: Reorder device memory 2 times here, replace the current + # implementation here when suitable operators become available. 
+ final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=scales, + expanded_src_to_dst_row=self.expanded_row_idx, + export_for_source_row=self.topk_ids, + ) + return final_hidden_states + + +# mypy: disable-error-code="override" +class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.apply_router_weight_on_input = False + self.local_ep = 1 + self.local_num_experts = self.num_experts // self.local_ep + self.local_num_group = self.top_k // self.local_ep + self.bsz = None + + def token_dispatch(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + row_idx: torch.Tensor, + expert_map: Optional[torch.Tensor] = None, + log2phy: Optional[torch.Tensor] = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[torch.Tensor] = None, + shared_gate_up: Optional[torch.Tensor] = None, + shared_dequant_scale: Optional[torch.Tensor] = None, + mc2_mask: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + with_quant: bool = False): + self.apply_router_weight_on_input = apply_router_weight_on_input + if self.apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * \ + topk_weights.to(hidden_states.dtype) + + self.bsz, _ = hidden_states.shape + flatten_topk_ids = topk_ids.view(-1) + self.sorted_topk_ids = torch.argsort(flatten_topk_ids.float()) + self.sorted_topk_ids = self.sorted_topk_ids.to(torch.int32) + sorted_hidden_states = hidden_states.index_select( + 0, self.sorted_topk_ids // self.local_num_group) + + experts_id = torch.arange(0, + self.local_num_experts, + dtype=topk_ids.dtype, + device=topk_ids.device) + 
num_tokens_per_expert = ( + flatten_topk_ids.unsqueeze(-1) == experts_id).to( + torch.float32).sum(0) + topk_scales = topk_weights.view(-1).index_select( + 0, self.sorted_topk_ids).unsqueeze(-1) + group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64) + group_list_type = 0 + return { + "group_list_type": group_list_type, + "hidden_states": sorted_hidden_states, + "group_list": group_list, + "topk_scales": topk_scales, + } + + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + unsorted_topk_ids = torch.argsort(self.sorted_topk_ids.float()).to( + torch.int32) + unsorted_hidden_states = hidden_states.index_select( + 0, unsorted_topk_ids) + final_hidden_states = unsorted_hidden_states.reshape( + self.bsz, self.top_k // self.local_ep, -1).sum(1) + return final_hidden_states + + +class TokenDispatcherWithAll2AllV(MoETokenDispatcher): + """ + The implementation of the AlltoAll-based token dispatcher, which handles token + dispatching on the sequence level instead of token level. The core of this implementation + lies in each device dispatching on the entire sequence, with the hidden state being partitioned. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.with_quant = False + self.num_local_experts = kwargs.get("num_local_experts", 0) + self.num_global_redundant_experts = kwargs.get( + "num_global_redundant_experts", 0) + self.num_experts = self.num_experts + self.num_global_redundant_experts + + self.hidden_shape = None + self.topk_weights = None + self.input_splits = None + self.output_splits = None + self.hidden_shape_before_permute = None + + # [tp_ep_size * ep_size, num_local_experts]. Represents the number of tokens sent + # to each local expert by all ranks. + self.num_global_tokens_per_local_expert = None + + # cached intermediate tensors. 
+ self.tokens_per_expert = None + self.global_input_tokens_local_experts_indices = None + + assert self.num_local_experts > 0, "Expected at least one expert" + if self.num_local_experts > 1: + self.expert_ids_per_ep_rank = torch.tensor( + [i % self.num_local_experts for i in range(self.num_experts)], + dtype=torch.int32, + device=torch.npu.current_device(), + ) + + local_expert_indices_offset = (self.ep_rank * self.num_local_experts) + + self.local_expert_indices = [ + local_expert_indices_offset + i + for i in range(self.num_local_experts) + ] + assert (len(self.local_expert_indices) == self.num_local_experts + ), "Invalid local expert indices" + for i in range(len(self.local_expert_indices) - 1): + assert (self.local_expert_indices[i] == + self.local_expert_indices[i + 1] - + 1), "local_expert_indices must be continuous" + + def token_dispatch(self, + hidden_states: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + row_idx: torch.Tensor, + expert_map: Optional[torch.Tensor] = None, + log2phy: Optional[torch.Tensor] = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[torch.Tensor] = None, + shared_gate_up: Optional[torch.Tensor] = None, + shared_dequant_scale: Optional[torch.Tensor] = None, + mc2_mask: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + with_quant: bool = False): + self.with_quant = with_quant + self.hidden_shape = hidden_states.shape + self.topk_weights = topk_weights + assert topk_weights.dim() == 2, "Expected 2D tensor for topk_weights" + assert topk_ids.dim() == 2, "Expected 2D tensor for routing map" + + if log2phy is not None: + topk_ids = log2phy[topk_ids] + + permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert = self._dispatch_preprocess( + hidden_states, topk_ids) + self.reversed_local_input_permutation_mapping = reversed_local_input_permutation_mapping + + dynamic_scale_after_all2all = None + if self.with_quant: + 
permutated_local_input_tokens, dynamic_scale = torch_npu.npu_dynamic_quant( + permutated_local_input_tokens) + + _, dynamic_scale_after_all2all, permute2_ep_all_to_all_handle = async_all_to_all( + dynamic_scale, + self.output_splits, + self.input_splits, + self.ep_group, + ) + permute2_ep_all_to_all_handle.wait() + dynamic_scale.untyped_storage().resize_(0) + + _, global_input_tokens, permute1_ep_all_to_all_handle = async_all_to_all( + permutated_local_input_tokens, + self.output_splits, + self.input_splits, + self.ep_group, + ) + permute1_ep_all_to_all_handle.wait() + permutated_local_input_tokens.untyped_storage().resize_(0) + + global_input_tokens, dynamic_scale = self._dispatch_postprocess( + global_input_tokens, dynamic_scale_after_all2all) + return { + "hidden_states": global_input_tokens, + "group_list": tokens_per_expert, + "dynamic_scale": dynamic_scale, + "group_list_type": 1 + } + + def token_combine(self, + hidden_states: torch.Tensor, + bias: torch.Tensor = None): + assert bias is None, "Bias is not supported in MoEAlltoAllvTokenDispatcher." 
+ + hidden_states = self._combine_preprocess(hidden_states) + + # Perform expert parallel AlltoAll communication + # hidden_states: [SEQL, H] -> [SEQL, H/TP] + _, permutated_local_input_tokens, handle = async_all_to_all( + hidden_states, self.input_splits, self.output_splits, + self.ep_group) + handle.wait() + hidden_states.untyped_storage().resize_(0) + + output = self._combine_postprocess(permutated_local_input_tokens) + + self.input_splits = None + self.output_splits = None + self.num_global_tokens_per_local_expert = None + + return output + + def _dispatch_preprocess(self, hidden_states, topk_ids): + assert self.hidden_shape is not None + hidden_states = hidden_states.view(-1, self.hidden_shape[-1]) + tokens_per_expert = self._preprocess(topk_ids) + + self.hidden_shape_before_permute = hidden_states.shape + + permutated_local_input_tokens, reversed_local_input_permutation_mapping = torch_npu.npu_moe_token_permute( + tokens=hidden_states, + indices=topk_ids, + num_out_tokens=self.num_out_tokens, + ) + return permutated_local_input_tokens, reversed_local_input_permutation_mapping, tokens_per_expert + + def _preprocess(self, topk_ids: torch.Tensor) -> torch.Tensor: + num_local_tokens_per_expert = torch.histc(topk_ids, + bins=self.num_experts, + min=0, + max=self.num_experts) + + ep_size = self.ep_size + + # Dropless + self.num_out_tokens = topk_ids.numel() + + # =================================================== + # Calculate input_splits, output_splits for alltoall-v. 
+ # =================================================== + self.input_splits = (num_local_tokens_per_expert.reshape( + ep_size, + self.num_local_experts).sum(axis=1).to(torch.device("cpu"), + non_blocking=True).numpy()) + num_global_tokens_per_expert = gather_from_sequence_parallel_region( + num_local_tokens_per_expert, + group=self.ep_group).reshape(ep_size, self.num_experts) + self.num_global_tokens_per_local_expert = num_global_tokens_per_expert[:, self.local_expert_indices[ + 0]:self.local_expert_indices[-1] + 1] + if self.num_global_tokens_per_local_expert is None: + raise ValueError( + "num_global_tokens_per_local_expert must be set before sum.") + self.output_splits = (self.num_global_tokens_per_local_expert.sum( + axis=-1).to(torch.device("cpu"), non_blocking=True).numpy()) + num_tokens_per_local_expert = self.num_global_tokens_per_local_expert.sum( + axis=0) + # =================================================== + # num_global_tokens_per_expert: [ep_size, num_experts] + # num_global_tokens_per_local_expert: [ep_size, num_local_experts] + # num_tokens_per_local_expert: [num_local_experts] + # =================================================== + + if self.num_local_experts > 1: + if self.num_global_tokens_per_local_expert is None: + raise ValueError( + "num_global_tokens_per_local_expert must be set before operations." 
+ ) + self.global_input_tokens_local_experts_indices = torch.repeat_interleave( + self.expert_ids_per_ep_rank, + self.num_global_tokens_per_local_expert.ravel()) + + return num_tokens_per_local_expert + + def _dispatch_postprocess(self, global_input_tokens, dynamic_scale=None): + # Early return if no local experts or no tokens + if self.num_local_experts <= 1: + return global_input_tokens, None + + # Handle quantized case + if self.with_quant: + assert self.global_input_tokens_local_experts_indices is not None, \ + "global_input_tokens_local_experts_indices must be initialized before calling _dispatch_postprocess" + expert_idx_2d = self.global_input_tokens_local_experts_indices.unsqueeze( + -1) + active_num = self.global_input_tokens_local_experts_indices.numel() + + # Handle case with no active tokens + if active_num <= 0: + self.reversed_global_input_permutation_mapping = self.global_input_tokens_local_experts_indices + return global_input_tokens, dynamic_scale + + # Process with active tokens + global_input_tokens, self.reversed_global_input_permutation_mapping, _, expanded_scale = torch_npu.npu_moe_init_routing_v2( + global_input_tokens, + expert_idx_2d, + scale=dynamic_scale, + active_num=active_num, + expert_capacity=0, + expert_num=self.num_local_experts, + expert_tokens_num_type=1, + expert_tokens_num_flag=True, + active_expert_range=[0, self.num_local_experts], + quant_mode=-1, + row_idx_type=0) + return global_input_tokens, expanded_scale + + # Handle non-quantized case + global_input_tokens, self.reversed_global_input_permutation_mapping = torch_npu.npu_moe_token_permute( + global_input_tokens, + self.global_input_tokens_local_experts_indices) + return global_input_tokens, None + + def _combine_preprocess(self, hidden_states): + # Unpermutation 2: expert output to AlltoAll input + if hidden_states.shape[0] > 0 and self.num_local_experts > 1: + hidden_states = torch_npu.npu_moe_token_unpermute( + hidden_states, 
self.reversed_global_input_permutation_mapping) + + return hidden_states + + def _combine_postprocess(self, permutated_local_input_tokens): + # Unpermutation 1: AlltoAll output to output + output = torch_npu.npu_moe_token_unpermute( + permuted_tokens=permutated_local_input_tokens, + sorted_indices=self.reversed_local_input_permutation_mapping.to( + torch.int32), + probs=self.topk_weights, + restore_shape=self.hidden_shape_before_permute) + + # Reshape the output tensor + output = output.view(self.hidden_shape) + return output diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py new file mode 100644 index 0000000..89e2bc7 --- /dev/null +++ b/vllm_ascend/ops/rotary_embedding.py @@ -0,0 +1,339 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +import math +from typing import Optional, Tuple + +import torch +import torch_npu +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) + +from vllm_ascend.platform import NPUPlatform +from vllm_ascend.utils import enable_custom_op, is_310p + + +def _custom_rotary_embedding_enabled(query, neox_style, head_size): + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) + + +def _rope_forward_oot( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + is_neox_style_override: Optional[bool] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + query_shape, key_shape = query.shape, key.shape + if self.cos_sin_cache.device != query.device: + self.cos_sin_cache = self.cos_sin_cache.to(query.device) + if self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.dtype) + neox_style = self.is_neox_style + if is_neox_style_override is not None: + neox_style = is_neox_style_override + # adopt custom kernel path for rotary_embedding + if _custom_rotary_embedding_enabled(query, neox_style, + self.head_size) and not is_310p(): + query, key = torch.ops._C.rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + neox_style, + ) + return query.view(query_shape), key.view(key_shape) + if offsets is not None: + raise NotImplementedError( + "Batched rotary embedding is currently not supported on NPU.") + else: + if self.rotary_dim < self.head_size: + num_tokens = query.shape[0] + query = query.view(num_tokens, -1, self.head_size) + key = key.view(num_tokens, -1, self.head_size) + q_rot = query[..., :self.rotary_dim] + q_pass = query[..., self.rotary_dim:] + k_rot = key[..., :self.rotary_dim] + k_pass = key[..., self.rotary_dim:] + q_rot = q_rot.contiguous().view(num_tokens, -1) + k_rot = k_rot.contiguous().view(num_tokens, -1) + 
torch_npu._npu_rotary_embedding( + positions, + q_rot, + k_rot, + self.head_size, + self.cos_sin_cache, + neox_style, + ) + q_rot = q_rot.view(num_tokens, -1, self.rotary_dim) + k_rot = k_rot.view(num_tokens, -1, self.rotary_dim) + q = torch.cat((q_rot, q_pass), dim=-1).reshape(query_shape) + k = torch.cat((k_rot, k_pass), dim=-1).reshape(key_shape) + return q, k + # TODO: Remove the contiguous in the future. + query = query.contiguous().view(query.shape[0], -1) + key = key.contiguous().view(key.shape[0], -1) + torch_npu._npu_rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + neox_style, + ) + return query.view(query_shape), key.view(key_shape) + + +class AscendRotaryEmbedding(RotaryEmbedding): + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, + ) -> None: + super().__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + + def forward_oot( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + is_neox_style_override: Optional[bool] = None, + ): + return _rope_forward_oot( + self, + positions, + query, + key, + offsets, + is_neox_style_override, + ) + + +class AscendDeepseekScalingRotaryEmbedding(DeepseekScalingRotaryEmbedding): + + def __init__( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + mscale: float = 1, + mscale_all_dim: float = 0, + ) -> None: + # Note: we adopt the native huggingface deepseek rope initialization code from + # https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py for + # its more ascend compute friendly + self.scaling_factor = scaling_factor + 
self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for interpolation. + self.mscale = float( + self._yarn_get_mscale(self.scaling_factor, float(mscale)) / + self._yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * + attn_factor) + super(DeepseekScalingRotaryEmbedding, + self).__init__(head_size, rotary_dim, max_position_embeddings, + base, is_neox_style, dtype) + self.max_seq_len = max_position_embeddings + self._set_cos_sin_cache(seq_len=max_position_embeddings, + device=NPUPlatform.device_type, + dtype=dtype) + + def _yarn_get_mscale(self, scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + def _rotate_half(self, x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + def _yarn_linear_ramp_mask(self, min_value, max_value, dim): + # Note: The if conditional branch is not used here + # to solve MTP compilation error. + max_value += (min_value == max_value).float() * 0.001 + linear_func = (torch.arange(dim, dtype=torch.float32) - + min_value) / (max_value - min_value) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + # Inverse dim formula to find dim based on number of rotations + def _yarn_find_correction_dim(self, + num_rotations, + dim, + base=10000, + max_position_embeddings=2048): + # Note: use torch instead of math to solve MTP compilation error. + return (dim * torch.log( + torch.tensor(max_position_embeddings) / + (num_rotations * 2 * torch.pi))) / (2 * + torch.log(torch.tensor(base))) + + # Find dim range bounds based on rotations + def _yarn_find_correction_range(self, + low_rot, + high_rot, + dim, + base=10000, + max_position_embeddings=2048): + # Note: use torch instead of math to solve MTP compilation error. 
+ low = torch.floor( + self._yarn_find_correction_dim(low_rot, dim, base, + max_position_embeddings)) + high = torch.ceil( + self._yarn_find_correction_dim(high_rot, dim, base, + max_position_embeddings)) + # Note: use torch instead of max/min to solve MTP compilation error. + return torch.clamp(low, min=0), torch.clamp(high, max=dim - 1) + + # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb + def _apply_rotary_pos_emb(self, + q, + k, + cos, + sin, + position_ids, + unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids] + sin = sin[position_ids] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + if len(q.shape) == 3: + q = q[:, :, None, :] + if len(k.shape) == 2: + k = k[:, None, None, :] + elif len(k.shape) == 3: + k = k[:, :, None, :] + + b, h_q, s, d = q.shape + q = q.view(b, h_q, s, d // 2, 2).transpose(4, 3).reshape(b, h_q, s, d) + + b, h_k, s, d = k.shape + k = k.view(b, h_k, s, d // 2, 2).transpose(4, 3).reshape(b, h_k, s, d) + + q_embed = (q * cos) + (self._rotate_half(q) * sin) + k_embed = (k * cos) + (self._rotate_half(k) * sin) + + q_embed = q_embed.view(b, h_q, d) + k_embed = k_embed.view(b, h_k, d) + + return q_embed, k_embed + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + dim = self.rotary_dim + + freq_extra = 1.0 / (self.base**( + torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)) + freq_inter = 1.0 / (self.scaling_factor * self.base**( + torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)) + + low, high = self._yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + dim, + self.base, + self.max_position_embeddings, + ) + inv_freq_mask = 1.0 - self._yarn_linear_ramp_mask( + low, high, dim // 2).to(device=device, dtype=torch.float32) + inv_freq = freq_inter * (1 - + inv_freq_mask) + freq_extra * inv_freq_mask + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(seq_len * self.scaling_factor, + device=device, + dtype=torch.float32) + + freqs = torch.outer(t, inv_freq) + cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale + sin_cached = torch.cat([freqs, freqs], dim=-1).sin() * self.mscale + cos_cached = cos_cached.to(dtype) + sin_cached = sin_cached.to(dtype) + cache = torch.cat( + [freqs.cos() * self.mscale, + freqs.sin() * self.mscale], dim=-1).to(dtype) + self.register_buffer("cos_sin_cache", cache, persistent=False) + self.register_buffer("cos_cached", cos_cached, persistent=False) + 
self.register_buffer("sin_cached", sin_cached, persistent=False) + + def forward(self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + max_seq_len: Optional[int] = None): + if max_seq_len is not None and max_seq_len > self.max_seq_len: + self._set_cos_sin_cache(max_seq_len, query.device, query.dtype) + if len(key.shape) == 2: + key = key[:, None, :] + # Note: we implement the non neox_style method with shuffle the last dim and neox style + # calculation method which is also more compute friendly to the ascend machine + # https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py + neox_style = True + if self.is_neox_style is False: + b, h_q, d = query.shape + query = query.view(b, h_q, d // 2, + 2).transpose(3, 2).reshape(b, h_q, d) + b, h_k, d = key.shape + key = key.view(b, h_k, d // 2, 2).transpose(3, + 2).reshape(b, h_k, d) + q_pe, k_pe = _rope_forward_oot(self, positions, query, key, offsets, + neox_style) + return q_pe, k_pe diff --git a/vllm_ascend/ops/sequence_parallel.py b/vllm_ascend/ops/sequence_parallel.py new file mode 100644 index 0000000..bfd327b --- /dev/null +++ b/vllm_ascend/ops/sequence_parallel.py @@ -0,0 +1,120 @@ +import torch +from torch.nn import functional as F +from vllm.distributed import (get_tensor_model_parallel_world_size, + get_tp_group, tensor_model_parallel_all_gather, + tensor_model_parallel_reduce_scatter) +from vllm.forward_context import get_forward_context + +from vllm_ascend.platform import NPUPlatform + + +class MetadataForPadding: + + def __init__(self, + padding_flag=False, + lengths_sum_padding=0, + lengths_sum_unpadding=0, + pad_size=0, + not_dummy_and_is_prefill=False): + self.padding_flag = padding_flag + self.not_dummy_and_is_prefill = not_dummy_and_is_prefill + + self.lengths_sum_padding = lengths_sum_padding + self.lengths_sum_unpadding = lengths_sum_unpadding + self.pad_size = pad_size + + self.tp_size = 
get_tp_group().world_size + self.tp_rank_in_group = get_tp_group().rank_in_group + + assert self.lengths_sum_padding % self.tp_size == 0 + self.slice_size = self.lengths_sum_padding // self.tp_size + + self.mc2_mask = torch.zeros( + self.lengths_sum_padding, + dtype=torch.bool, + device=NPUPlatform.device_type, + ) + self.mc2_mask[:lengths_sum_unpadding] = True + + def padding_aligned_reduce_scatter(self, + data: torch.Tensor) -> torch.Tensor: + if self.padding_flag: + pad_size = self.pad_size + padded_data = F.pad(data, (0, 0, 0, pad_size)) + else: + padded_data = data + padded_data_reduce_scatter = tensor_model_parallel_reduce_scatter( + padded_data, 0) + + return padded_data_reduce_scatter + + def allgather_unpadding_aligned(self, + padded_data: torch.Tensor) -> torch.Tensor: + padded_data_allgather = tensor_model_parallel_all_gather( + padded_data, 0) + if self.padding_flag: + lengths_sum_unpadding = self.lengths_sum_unpadding + unpadding_data = padded_data_allgather[:lengths_sum_unpadding] + else: + unpadding_data = padded_data_allgather + return unpadding_data + + def padding_slice(self, data: torch.Tensor) -> torch.Tensor: + + padded_data = F.pad(data, (0, 0, 0, self.pad_size)) + start = self.tp_rank_in_group * self.slice_size + end = start + self.slice_size + slice_data = padded_data[start:end] + + return slice_data + + def padding_aligned_scatter(self, data: torch.Tensor) -> torch.Tensor: + if self.padding_flag: + pad_size = self.pad_size + padded_data = F.pad(data, (0, 0, 0, pad_size)) + else: + padded_data = data + # padded_data = data + padded_data = torch.tensor_split(padded_data, self.tp_size, dim=0) + + padded_data_reduce_scatter = padded_data[self.tp_rank_in_group] + + return padded_data_reduce_scatter + + +def init_metadata_for_sp(input_ids, enable_sequence_parallelism): + if not enable_sequence_parallelism: + return MetadataForPadding(padding_flag=False, + not_dummy_and_is_prefill=False) + + is_perifll = 0 + attn_metadata = 
get_forward_context().attn_metadata + tp_size = get_tensor_model_parallel_world_size() + if attn_metadata is not None: + if hasattr(attn_metadata, + 'is_only_prefill') and attn_metadata.is_only_prefill: + is_perifll = 1 + if hasattr(attn_metadata, + 'num_prefills') and attn_metadata.num_prefills > 0: + is_perifll = 1 + + if is_perifll: + lengths_sum_unpadding = input_ids.shape[0] + lengths_sum_padding = ( + (lengths_sum_unpadding + tp_size - 1) // tp_size) * tp_size + if lengths_sum_unpadding == lengths_sum_padding: + padding_flag = False + else: + padding_flag = True + pad_size = lengths_sum_padding - lengths_sum_unpadding + _metadata_for_padding = MetadataForPadding( + lengths_sum_unpadding=lengths_sum_unpadding, + lengths_sum_padding=lengths_sum_padding, + padding_flag=padding_flag, + pad_size=pad_size, + not_dummy_and_is_prefill=True) + + return _metadata_for_padding + + return MetadataForPadding(padding_flag=False, + not_dummy_and_is_prefill=False) diff --git a/vllm_ascend/ops/vocab_parallel_embedding.py b/vllm_ascend/ops/vocab_parallel_embedding.py new file mode 100644 index 0000000..7ad35dc --- /dev/null +++ b/vllm_ascend/ops/vocab_parallel_embedding.py @@ -0,0 +1,254 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Optional, Tuple + +import torch +from torch import nn +from torch.nn.parameter import Parameter +from vllm.distributed import divide, tensor_model_parallel_all_reduce +from vllm.distributed.parallel_state import get_tp_group +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, UnquantizedEmbeddingMethod, + VocabParallelEmbedding, pad_vocab_size) +from vllm.model_executor.utils import set_weight_attrs + +from vllm_ascend.distributed.parallel_state import get_lmhead_tp_group +from vllm_ascend.utils import lmhead_tp_enable + + +class AscendVocabParallelEmbedding(VocabParallelEmbedding): + """ + Register VocabParallelEmbedding as a custom op for Ascend. + AscendVocabParallelEmbedding support different communication parallel groups + Added the feature of lmheadTP in pure dp scenario + """ + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + params_dtype: Optional[torch.dtype] = None, + org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + nn.Module.__init__(self) + + if lmhead_tp_enable() and prefix.find("lm_head") != -1: + self.comm_group = get_lmhead_tp_group() + else: + self.comm_group = get_tp_group() + + self.tp_size = self.comm_group.world_size + self.tp_rank = self.comm_group.rank_in_group + + self.num_embeddings = num_embeddings + self.padding_size = padding_size + self.org_vocab_size = org_num_embeddings or num_embeddings + num_added_embeddings = num_embeddings - self.org_vocab_size + self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, + self.padding_size) + self.num_embeddings_padded = pad_vocab_size( + 
self.org_vocab_size_padded + num_added_embeddings, + self.padding_size) + assert self.org_vocab_size_padded <= self.num_embeddings_padded + + self.shard_indices = self._get_indices(self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + self.tp_rank, self.tp_size) + self.embedding_dim = embedding_dim + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() + + # If we are making an embedding layer, then our quantization linear + # method must implement the embedding operation. If we are another + # layer type like ParallelLMHead, this is not important. + is_embedding_layer = type(self) is VocabParallelEmbedding + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method)) + if is_embedding_layer and not quant_method_implements_embedding: + raise NotImplementedError( + f"The class {type(quant_method).__name__} must implement " + "the 'embedding' method, see UnquantizedEmbeddingMethod.") + + self.quant_method: QuantizeMethodBase = quant_method + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + # Divide the weight matrix along the vocaburaly dimension. 
+ self.num_added_embeddings = self.num_embeddings - self.org_vocab_size + self.num_embeddings_per_partition = divide(self.num_embeddings_padded, + self.tp_size) + assert (self.shard_indices.num_elements_padded == + self.num_embeddings_per_partition) + self.num_org_embeddings_per_partition = ( + self.shard_indices.org_vocab_end_index - + self.shard_indices.org_vocab_start_index) + self.num_added_embeddings_per_partition = ( + self.shard_indices.added_vocab_end_index - + self.shard_indices.added_vocab_start_index) + + self.quant_method.create_weights(self, + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader) + + def _get_masked_input_and_mask( + self, input_: torch.Tensor, org_vocab_start_index: int, + org_vocab_end_index: int, num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: + # torch.compile will fuse all of the pointwise ops below + # into a single kernel, making it very fast + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) + # Adapt: avoid create added_vocab_mask when added_vocab_start_index == added_vocab_end_index. + if added_vocab_start_index == added_vocab_end_index: + valid_offset = (org_vocab_start_index * org_vocab_mask) + vocab_mask = org_vocab_mask + else: + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index) + added_offset = added_vocab_start_index - ( + org_vocab_end_index - + org_vocab_start_index) - num_org_vocab_padding + valid_offset = (org_vocab_start_index * + org_vocab_mask) + (added_offset * added_vocab_mask) + vocab_mask = org_vocab_mask | added_vocab_mask + # Adapt end. + input_ = vocab_mask * (input_ - valid_offset) + return input_, ~vocab_mask + + def forward(self, input_): + if self.tp_size > 1: + # Build the mask. 
+ masked_input, input_mask = self._get_masked_input_and_mask( + input_, self.shard_indices.org_vocab_start_index, + self.shard_indices.org_vocab_end_index, + self.shard_indices.num_org_vocab_padding, + self.shard_indices.added_vocab_start_index, + self.shard_indices.added_vocab_end_index) + else: + masked_input = input_ + # Get the embeddings. + output_parallel = self.quant_method.embedding(self, + masked_input.long()) + # Mask the output embedding. + if self.tp_size > 1: + output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) + # Reduce across all the model parallel GPUs. + output = tensor_model_parallel_all_reduce(output_parallel) + return output + + +class AscendParallelLMHead(ParallelLMHead): + """ + Register ParallelLMHead as a custom op for Ascend.""" + + def __init__(self, + num_embeddings: int, + embedding_dim: int, + bias: bool = False, + params_dtype: Optional[torch.dtype] = None, + org_num_embeddings: Optional[int] = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + AscendVocabParallelEmbedding.__init__(self, num_embeddings, + embedding_dim, params_dtype, + org_num_embeddings, padding_size, + quant_config, prefix) + + self.quant_config = quant_config + if bias: + self.bias = Parameter( + torch.empty(self.num_embeddings_per_partition, + dtype=params_dtype)) + set_weight_attrs(self.bias, { + "output_dim": 0, + "weight_loader": self.weight_loader, + }) + else: + self.register_parameter("bias", None) + + +class AscendLogitsProcessor(LogitsProcessor): + """ + Register LogitsProcessor as a custom op for Ascend. 
+ Added the feature of lmheadTP in pure dp scenario + """ + + def _get_logits( + self, + hidden_states: torch.Tensor, + lm_head: AscendParallelLMHead, + embedding_bias: Optional[torch.Tensor] = None, + ) -> Optional[torch.Tensor]: + if lmhead_tp_enable(): + return self._get_logits_lmheadtp(hidden_states, lm_head, + embedding_bias) + else: + return self._get_logits_normal(hidden_states, lm_head, + embedding_bias) + + def _get_logits_lmheadtp( + self, + hidden_states: torch.Tensor, + lm_head: AscendParallelLMHead, + embedding_bias: Optional[torch.Tensor], + ) -> Optional[torch.Tensor]: + # Gather hidden states from all devices in tensor parallel group + gathered_hidden_states = get_lmhead_tp_group().all_gather( + hidden_states, dim=0) + local_logits = lm_head.quant_method.apply(lm_head, + gathered_hidden_states, + bias=embedding_bias) + # Gather logits for tensor parallel + logits = get_lmhead_tp_group().all_to_all(local_logits) + # Remove paddings in vocab (if any) + if logits is not None: + logits = logits[..., :self.org_vocab_size] + return logits + + def _get_logits_normal( + self, + hidden_states: torch.Tensor, + lm_head: AscendParallelLMHead, + embedding_bias: Optional[torch.Tensor], + ) -> Optional[torch.Tensor]: + local_logits = lm_head.quant_method.apply(lm_head, + hidden_states, + bias=embedding_bias) + # Gather logits for tensor parallel + logits = self._gather_logits(local_logits) + + # Remove paddings in vocab (if any) + if logits is not None: + logits = logits[..., :self.org_vocab_size] + + return logits diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py new file mode 100644 index 0000000..754a344 --- /dev/null +++ b/vllm_ascend/patch/__init__.py @@ -0,0 +1,104 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ---------------------------------------------------------------------------------- +# This module manage the patch for vllm. There are two folders in this module: +# - platform: contains the patches applied before worker starts. It's called by +# `vllm_ascend.utils.adapt_patch(is_global_patch=True)` in +# `vllm_ascend.platform.NPUPlatform.pre_register_and_update()` function. +# - worker: contains the patches applied when worker starts. It's called by +# `vllm_ascend.utils.adapt_patch(is_global_patch=False)` in +# each worker's `__init__` function. +# +# Then in each kind of patch, there are three folders: +# - patch_0_10_0: contains the patches applied when vllm version is 0.10.0. +# - patch_main: contains the patches applied when vllm version is main branch. +# - patch_common: contains the patches applied in both 0.10.0 and main branch. +# +# Once a new patch is added in vllm-ascend, please add the patch description into this file as well. +# ---------------------------------------------------------------------------------- + +# What's Patched and how it works: +# -------------------------------- +# * Platform Patch: +# ================= +# ** File: platform/patch_common/patch_distributed.py** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.config.ParallelConfig.get_next_dp_init_port` +# Why: +# vllm doesn't support get port from environment. +# How: +# Add the logic to get port from environment. +# Related PR (if no, explain why): +# Need a PR to vllm to support get port from environment. 
+# Future Plan: +# Remove those patch when vllm merged them +# +# * Worker Patch: +# =============== +# ** File: worker/patch_common/patch_minicpm.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.model_executor.models.minicpm.MiniCPMAttention.forward` +# Why: +# The forward func of MiniCPMAttention in vllm do a datatype convert +# (original datatype --> float32) to ensure the precision on cuda. +# However float32 is not supported in cann rope op, thus we keep this patch +# How: +# Removed the dtype convert operations in forward +# Related PR (if no, explain why): +# NO, only for npu due to rope op. +# Future Plan: +# Keep this patch in vllm-ascend. +# +# ** File: worker/patch_common/patch_distributed.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.distributed.parallel_state.GroupCoordinator` +# Why: +# vllm doesn't support all_to_all for GroupCoordinator. +# How: +# Add all_to_all implementation for GroupCoordinator. +# Related PR (if no, explain why): +# Need a PR to vllm to support all_to_all for GroupCoordinator. +# Future Plan: +# Remove this patch when vllm merged them. +# +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.v1.sample.sampler.Sampler.gather_logprobs` +# Why: +# We need to patch gather_logprobs to make sure call batched_count_greater_than +# with backend=current_platform.simple_compile_backend +# How: +# Patch gather_logprobs call new batched_count_greater_than +# Related PR (if no, explain why): +# - https://github.com/vllm-project/vllm/pull/21591 +# Future Plan: +# Revert it when vLLM merge #21591 and release new version +# ** File: worker/patch_common/patch_linear.py ** +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. `vllm.model_executor.layers.linear.RowParallelLinear` +# Why: +# We need to fuse matmul and allreuce in `RowParallelLinear` +# to improve performance. +# How: +# Create a new class `AscendRowParallelLinear` that inherits from `RowParallelLinear`. 
+# In this class, we override the `forward` method to use +# torch_npu.npu_mm_all_reduce_base to replace matmul and allreduce. +# Related PR (if no, explain why): +# - https://github.com/vllm-project/vllm-ascend/pull/1926 +# Future Plan: +# Validate more models in all kinds of scenario, +# if performance is always improved, we can enable this patch by default and remove the env +# variable `VLLM_ASCEND_ENABLE_FUSE_MATMUL_ALLREDUCE` in the future. diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py new file mode 100644 index 0000000..c0772a8 --- /dev/null +++ b/vllm_ascend/patch/platform/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from vllm_ascend.patch.platform import patch_common # noqa: F401 +from vllm_ascend.patch.platform import patch_main # noqa: F401 diff --git a/vllm_ascend/patch/platform/patch_common/__init__.py b/vllm_ascend/patch/platform/patch_common/__init__.py new file mode 100644 index 0000000..f88f2a9 --- /dev/null +++ b/vllm_ascend/patch/platform/patch_common/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import vllm_ascend.patch.platform.patch_common.patch_distributed # noqa diff --git a/vllm_ascend/patch/platform/patch_common/patch_distributed.py b/vllm_ascend/patch/platform/patch_common/patch_distributed.py new file mode 100644 index 0000000..67d4797 --- /dev/null +++ b/vllm_ascend/patch/platform/patch_common/patch_distributed.py @@ -0,0 +1,115 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapted from vllm/model_executor/models/qwen2_vl.py +# This file is a part of the vllm-ascend project. + +import torch +import vllm.envs as envs_vllm +from vllm.config import ParallelConfig + +from vllm_ascend.utils import is_310p + + +def parallel_config_get_dp_port(self) -> int: + """ + We might need to initialize process groups in multiple + processes that is related to data parallelism, + e.g. both in the worker and in the engine, which + can live in different processes. 
To avoid port conflicts, we + increment the port number each time we need to initialize a + new process group related to data parallelism. + """ + answer = self.data_parallel_master_port + self.data_parallel_master_port += 1 + + # NOTE: Get port from envs directly when using torchrun + port = envs_vllm.VLLM_DP_MASTER_PORT if envs_vllm.VLLM_DP_MASTER_PORT else answer + return port + + +ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port + + +class NullHandle: + + def __init__(self): + pass + + def wait(self): + pass + + +def communication_adaptation_310p(): + + def broadcast310p_wrapper(fn): + + def broadcast310p(tensor, src, group=None, async_op=False): + if tensor.device == torch.device('cpu'): + return fn(tensor, src, group, async_op) + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + tensor[...] 
= tensor_list[src] + if async_op: + return NullHandle() + else: + return None + + return broadcast310p + + torch.distributed.broadcast = broadcast310p_wrapper( + torch.distributed.broadcast) + torch.distributed.distributed_c10d.broadcast = broadcast310p_wrapper( + torch.distributed.distributed_c10d.broadcast) + + def all_reduce_wrapper_310p(fn): + + def all_reduce( + tensor, + op=torch.distributed.ReduceOp.SUM, + group=None, + async_op=False, + ): + if tensor.dtype != torch.int64: + return fn(tensor, op, group, async_op) + rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + tensor_list = [torch.empty_like(tensor) for _ in range(world_size)] + tensor_list[rank] = tensor + torch.distributed.all_gather(tensor_list, tensor, group=group) + if op == torch.distributed.ReduceOp.SUM: + return torch.stack(tensor_list).sum(0) + elif op == torch.distributed.ReduceOp.MAX: + return torch.tensor( + torch.stack(tensor_list).cpu().numpy().max(0), + device=tensor.device, + ) + else: + raise RuntimeError(f"not implement op {op}") + + return all_reduce + + torch.distributed.all_reduce = all_reduce_wrapper_310p( + torch.distributed.all_reduce) + torch.distributed.distributed_c10d.all_reduce = all_reduce_wrapper_310p( + torch.distributed.distributed_c10d.all_reduce) + + +if is_310p(): + communication_adaptation_310p() diff --git a/vllm_ascend/patch/platform/patch_main/__init__.py b/vllm_ascend/patch/platform/patch_main/__init__.py new file mode 100644 index 0000000..116c73c --- /dev/null +++ b/vllm_ascend/patch/platform/patch_main/__init__.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py new file mode 100644 index 0000000..d294f14 --- /dev/null +++ b/vllm_ascend/patch/worker/__init__.py @@ -0,0 +1,19 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm_ascend.patch.worker import patch_common # noqa: F401 +from vllm_ascend.patch.worker import patch_main # noqa: F401 diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py new file mode 100644 index 0000000..8d206bf --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/__init__.py @@ -0,0 +1,22 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import vllm_ascend.patch.worker.patch_common.patch_distributed # noqa +import vllm_ascend.patch.worker.patch_common.patch_linear # noqa +import vllm_ascend.patch.worker.patch_common.patch_logits # noqa +import vllm_ascend.patch.worker.patch_common.patch_lora_embedding # noqa +import vllm_ascend.patch.worker.patch_common.patch_minicpm # noqa diff --git a/vllm_ascend/patch/worker/patch_common/patch_distributed.py b/vllm_ascend/patch/worker/patch_common/patch_distributed.py new file mode 100644 index 0000000..846d82c --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_distributed.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import List, Optional + +import torch +import vllm +from vllm.distributed.parallel_state import GroupCoordinator + + +class GroupCoordinatorPatch(GroupCoordinator): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def all_to_all(self, + input_: torch.Tensor, + scatter_dim: int = 0, + gather_dim: int = -1, + scatter_sizes: Optional[List[int]] = None, + gather_sizes: Optional[List[int]] = None) -> torch.Tensor: + if self.world_size == 1: + return input_ + assert -input_.dim() <= scatter_dim < input_.dim(), ( + f"Invalid scatter dim ({scatter_dim}) for input tensor with shape {input_.size()}" + ) + assert -input_.dim() <= gather_dim < input_.dim(), ( + f"Invalid gather dim ({gather_dim}) for input tensor with shape {input_.size()}" + ) + return self.device_communicator.all_to_all(input_, scatter_dim, + gather_dim, scatter_sizes, + gather_sizes) + + +vllm.distributed.parallel_state.GroupCoordinator = GroupCoordinatorPatch # Note: check the GroupCoordinator with online serving \ No newline at end of file diff --git a/vllm_ascend/patch/worker/patch_common/patch_linear.py b/vllm_ascend/patch/worker/patch_common/patch_linear.py new file mode 100644 index 0000000..5690ba8 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_linear.py @@ -0,0 +1,147 @@ +""" +Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +This file is a part of the vllm-ascend project. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +from typing import Optional, Union + +import torch +import torch_npu +import vllm +from torch.distributed import ProcessGroup +from torch.nn.parameter import Parameter +from vllm.distributed import (get_tensor_model_parallel_rank, + split_tensor_along_last_dim) +from vllm.distributed.parallel_state import get_tp_group +from vllm.logger import logger +from vllm.model_executor.layers.linear import RowParallelLinear + +import vllm_ascend.envs as envs_ascend + +_HCOMM_INFO = None + + +class AscendRowParallelLinear(RowParallelLinear): + """ + AscendRowParallelLinear is a custom implementation of RowParallelLinear + that overrides the forward method to handle Ascend-specific operations. + """ + + def __init__(self, *args, **kwargs): + """Initialize the AscendRowParallelLinear layer. + + Args: + *args: Variable length argument list. + **kwargs: Arbitrary keyword arguments. + """ + tp_group = get_tp_group().device_group + hcomm_info = self.get_hcomm_info(tp_group) + self.hcomm_info = hcomm_info + super().__init__(*args, **kwargs) + self.weight_t = self.weight.t() + + @staticmethod + def get_hcomm_info(group: ProcessGroup) -> str: + """Get the HCCL communication information for the given group. + + Args: + group (ProcessGroup): The process group for which to get the HCCL communication info. + + Returns: + str: The HCCL communication name for the given group. + """ + global _HCOMM_INFO + if _HCOMM_INFO is not None: + return _HCOMM_INFO + + rank = torch.distributed.get_rank(group) + if torch.__version__ > "2.0": + global_rank = torch.distributed.get_global_rank(group, rank) + _HCOMM_INFO = group._get_backend( + torch.device("npu")).get_hccl_comm_name(global_rank) + + else: + _HCOMM_INFO = group.get_hccl_comm_name(rank) + return _HCOMM_INFO + + def forward( + self, input_: torch.Tensor + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + """Forward pass for the AscendRowParallelLinear layer. 
+ + Args: + input_ (torch.Tensor): the input tensor to the layer. + + Returns: + Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]: + The output tensor after applying the linear transformation, + and optionally the bias if `return_bias` is True. + """ + input_parallel = self.calc_input(input_) + + # Matrix multiply. + assert self.quant_method is not None + # Only fuse bias add into GEMM for rank 0 (this ensures that + # bias will not get added more than once in TP>1 case) + output = self.calc_output(input_parallel) + + output_bias = self.bias if self.skip_bias_add else None + + if not self.return_bias: + return output + return output, output_bias + + def calc_input(self, input_: torch.Tensor) -> torch.Tensor: + """Calculate the input tensor for parallel processing. + + Args: + input_ (torch.Tensor): the input tensor to be processed. + + Returns: + torch.Tensor: The input tensor split along the last dimension + for tensor model parallelism, or the original input if not parallel. + """ + if self.input_is_parallel: + return input_ + tp_rank = get_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + return splitted_input[tp_rank].contiguous() + + def calc_output(self, input_parallel: torch.Tensor) -> torch.Tensor: + """Calculate the output tensor of forward by considering + fusing communication and computation. + + Args: + input_parallel (_type_): the input tensor to be processed in parallel. + + Returns: + torch.Tensor: the output tensor after applying the linear transformation + and optionally handle communication between tensor model parallel ranks. 
+ """ + bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias + if self.reduce_results and self.tp_size > 1: + output = torch_npu.npu_mm_all_reduce_base(input_parallel, + self.weight_t, + self.hcomm_info, + bias=bias_) + else: + output = self.quant_method.apply(self, input_parallel, bias=bias_) + return output + + +if envs_ascend.VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE: + logger.info("AscendRowParallelLinear: Matmul all-reduce is enabled. ") + vllm.model_executor.layers.linear.RowParallelLinear = AscendRowParallelLinear diff --git a/vllm_ascend/patch/worker/patch_common/patch_logits.py b/vllm_ascend/patch/worker/patch_common/patch_logits.py new file mode 100644 index 0000000..84a92f9 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_logits.py @@ -0,0 +1,26 @@ +import torch +import vllm +from vllm._custom_ops import apply_repetition_penalties_torch + + +def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, + repetition_penalties: torch.Tensor) -> None: + """Apply repetition penalties to logits in-place. + + Args: + logits: The logits tensor of shape [num_seqs, vocab_size]. + prompt_mask: A boolean tensor indicating which tokens appear in the prompt. + output_mask: A boolean tensor indicating which tokens appear in the output. + repetition_penalties: The repetition penalties of shape (num_seqs, ). + """ + apply_repetition_penalties_torch(logits, prompt_mask, output_mask, + repetition_penalties) + + +# NPU device type tensors have attributes is_cuda=True and is_npu=True, according to its implementation in +# https://github.com/Ascend/pytorch/blob/863b9071cbdf47023c12c246e3efa9c6e2285fc6/torch_npu/npu/_stream_check.py#L74 +# This causes that vLLM's apply_repetition_penalties function will run into the branch of "if logits.is_cuda" and +# call the custom op implemented in CUDA, which is not compatible with NPU. 
+# Reference: https://github.com/vllm-project/vllm/blob/f66673a39d9f364194c249f28098cad8a5584ccb/vllm/_custom_ops.py#L314 +vllm._custom_ops.apply_repetition_penalties = apply_repetition_penalties diff --git a/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py b/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py new file mode 100644 index 0000000..02d5804 --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_lora_embedding.py @@ -0,0 +1,29 @@ +from typing import Optional + +import vllm +from torch import nn +from transformers import PretrainedConfig +from vllm.config import LoRAConfig +from vllm.lora.layers import VocabParallelEmbeddingWithLoRA +from vllm.lora.utils import _all_lora_classes + +from vllm_ascend.ops.vocab_parallel_embedding import \ + AscendVocabParallelEmbedding + + +class AscendVocabParallelEmbeddingWithLoRA(VocabParallelEmbeddingWithLoRA): + + @classmethod + def can_replace_layer( + cls, + source_layer: nn.Module, + lora_config: LoRAConfig, + packed_modules_list: list, + model_config: Optional[PretrainedConfig], + ) -> bool: + return type(source_layer) is AscendVocabParallelEmbedding + + +# Patch for lora register_model issue after overriding VocabParallelEmbedding class (#2515) +_all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) +vllm.lora.utils._all_lora_classes = _all_lora_classes diff --git a/vllm_ascend/patch/worker/patch_common/patch_minicpm.py b/vllm_ascend/patch/worker/patch_common/patch_minicpm.py new file mode 100644 index 0000000..663a08a --- /dev/null +++ b/vllm_ascend/patch/worker/patch_common/patch_minicpm.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import torch +from vllm.model_executor.models.minicpm import MiniCPMAttention + + +def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, +) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +# The type conversion in the forward function is deleted to support the rope operator. +MiniCPMAttention.forward = forward diff --git a/vllm_ascend/patch/worker/patch_main/__init__.py b/vllm_ascend/patch/worker/patch_main/__init__.py new file mode 100644 index 0000000..2ed088b --- /dev/null +++ b/vllm_ascend/patch/worker/patch_main/__init__.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
\ No newline at end of file
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
new file mode 100644
index 0000000..57ace2b
--- /dev/null
+++ b/vllm_ascend/platform.py
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

import gc
from datetime import timedelta
from typing import TYPE_CHECKING, Optional, Tuple

import torch
import vllm.envs as envs_vllm
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import PrefixStore
from vllm.logger import logger
from vllm.platforms import Platform, PlatformEnum

from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                       init_ascend_config)
from vllm_ascend.torchair.utils import (check_torchair_cache_exist,
                                        delete_torchair_cache_file)
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, is_310p,
                               update_aclgraph_sizes)

if TYPE_CHECKING:
    from vllm.config import ModelConfig, VllmConfig
    from vllm.utils import FlexibleArgumentParser
else:
    ModelConfig = None
    VllmConfig = None
    FlexibleArgumentParser = None


class NPUPlatform(Platform):
    """vLLM out-of-tree (OOT) platform plugin for Huawei Ascend NPUs.

    Registers NPU device naming/visibility, validates and rewrites the vLLM
    config for Ascend (compilation level, ACL graph mode, worker class,
    block size, scheduler), and selects Ascend attention/communicator/LoRA
    backends.
    """

    _enum = PlatformEnum.OOT
    device_name: str = "npu"
    device_type: str = "npu"
    simple_compile_backend: str = "eager"  # Disable torch.compile()
    ray_device_key: str = "NPU"
    device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
    dispatch_key: str = "PrivateUse1"

    supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD]

    def is_sleep_mode_available(self) -> bool:
        return True

    @classmethod
    def pre_register_and_update(cls,
                                parser: Optional[FlexibleArgumentParser] = None
                                ) -> None:
        # Adapt the global patch here.
        from vllm_ascend.utils import adapt_patch
        adapt_patch(is_global_patch=True)

        # For online serving, "ascend" quantization method is not a choice natively,
        # so we need to add "ascend" quantization method to quantization methods list
        # and the user can enable quantization using "vllm serve --quantization ascend".
        if parser is not None:
            quant_action = parser._option_string_actions.get('--quantization')
            if quant_action and hasattr(quant_action,
                                        'choices') and quant_action.choices:
                if ASCEND_QUANTIZATION_METHOD not in quant_action.choices:
                    quant_action.choices.append(ASCEND_QUANTIZATION_METHOD)

        # Import for its registration side effect (@register_quantization_config).
        from vllm_ascend.quantization.quant_config import \
            AscendQuantConfig  # noqa: F401

    @classmethod
    def get_device_capability(cls, device_id: int = 0):
        # No CUDA-style compute capability on NPU.
        return None

    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        return torch.npu.get_device_name(device_id)

    @classmethod
    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
        return True

    @classmethod
    def inference_mode(cls):
        return torch.inference_mode()

    @classmethod
    def set_device(cls, device: torch.device):
        torch.npu.set_device(device)

    @classmethod
    def empty_cache(cls):
        torch.npu.empty_cache()

    @classmethod
    def synchronize(cls):
        torch.npu.synchronize()

    @classmethod
    def mem_get_info(cls) -> Tuple[int, int]:
        return torch.npu.mem_get_info()

    @classmethod
    def clear_npu_memory(cls):
        # Collect Python garbage first so cached NPU blocks can be released.
        gc.collect()
        torch.npu.empty_cache()
        torch.npu.reset_peak_memory_stats()

    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Validate and rewrite `vllm_config` in place for Ascend NPU."""
        if not envs_vllm.VLLM_USE_V1:
            raise ValueError("vLLM Ascend does not support V0 engine.")
        # initialize ascend config from vllm additional_config
        ascend_config = init_ascend_config(vllm_config)

        from vllm.config import CompilationLevel  # noqa: E402
        compilation_config = vllm_config.compilation_config
        model_config = vllm_config.model_config
        parallel_config = vllm_config.parallel_config
        cache_config = vllm_config.cache_config
        kv_cache_dtype = vllm_config.additional_config.get(
            "kv_cache_dtype", None)
        if kv_cache_dtype is not None:
            vllm_config.cache_config.cache_dtype = kv_cache_dtype

        if model_config is None:
            logger.warning("Model config is missing. This may indicate "
                           "that we are running a test case")
            enforce_eager = False
        else:
            enforce_eager = getattr(model_config, "enforce_eager", False)

        check_ascend_config(vllm_config, enforce_eager)
        from vllm.config.compilation import CUDAGraphMode
        if enforce_eager:
            logger.info("Compilation disabled, using eager mode by default")
            compilation_config.level = CompilationLevel.NO_COMPILATION

        compilation_config.cudagraph_num_of_warmups = 1

        # TODO: make vllm support oot platform to set `compilation_config.cudagraph_mode`
        # if cudagraph_mode is not explicitly set by users, set default value
        if compilation_config.level == CompilationLevel.PIECEWISE:
            compilation_config.cudagraph_mode = \
                CUDAGraphMode.PIECEWISE
        elif compilation_config.level not in [
                CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE
        ]:
            logger.warning(
                "NPU does not support %s compilation level. Setting CUDAGraphMode to NONE",
                compilation_config.level)
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
        else:
            logger.warning(
                "compilation_config.level = CompilationLevel.NO_COMPILATION is set, Setting CUDAGraphMode to NONE"
            )
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        # Set CUDAGraphMode to NONE when torchair is enabled, no matter what
        # compilation_config.level is.
        if ascend_config.torchair_graph_config.enabled:
            logger.info(
                "Torchair compilation enabled on NPU. Setting CUDAGraphMode to NONE"
            )
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            # Note: We delete the torchair cache folder here to prevent runtime issues caused by dimension
            # mismatches or configuration inconsistencies when users reuse cached computation graphs. Though
            # this will increase graph compilation duration, it significantly enhances robustness and decreases
            # graph launching time during inference.
            if check_torchair_cache_exist(
            ) and not ascend_config.torchair_graph_config.use_cached_kv_cache_bytes:
                logger.warning(
                    "Torchair cache folder is deleted here to prevent runtime issues caused by dimension "
                    "mismatches or configuration inconsistencies when users reuse cached computation graphs. "
                    "In order to decrease torchair graph compilation time, users can enable both use_cached_graph "
                    "and use_cached_kv_cache_bytes in torchair_graph_config.")
                delete_torchair_cache_file()

        if parallel_config.distributed_executor_backend == "ray":
            logger.warning(
                "Ray distributed executor backend is not compatible with ACL Graph mode "
                "right now. Setting CUDAGraphMode to NONE")
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE

        # Set cudagraph sizes before extending `compilation_config.splitting_ops`.
        vllm_config._set_cudagraph_sizes()

        if compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
            compilation_config.level = CompilationLevel.NO_COMPILATION
        elif compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE:
            logger.info(
                "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
                "using only ACL Graph mode")
            assert compilation_config.level == CompilationLevel.PIECEWISE, \
                "When enabling piecewise aclgraph, please make sure compilation_config.level == CompilationLevel.PIECEWISE and compilation_config.cudagraph_mode == CUDAGraphMode.PIECEWISE"
            compilation_config.set_splitting_ops_for_v1()
            compilation_config.use_inductor = False
            # Split the graph at Ascend attention so each piece can be captured.
            compilation_config.splitting_ops.extend(
                ["vllm.unified_ascend_attention_with_output"])
            update_aclgraph_sizes(vllm_config)
        else:
            logger.info(
                "%s cudagraph_mode is not support on NPU. falling back to NONE",
                compilation_config.cudagraph_mode)
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            compilation_config.level = CompilationLevel.NO_COMPILATION

        if parallel_config and parallel_config.worker_cls == "auto":
            if ascend_config.torchair_graph_config.enabled:
                parallel_config.worker_cls = "vllm_ascend.torchair.torchair_worker.NPUTorchairWorker"
            else:
                parallel_config.worker_cls = "vllm_ascend.worker.worker_v1.NPUWorker"

        if cache_config:
            if cache_config.block_size is None:
                cache_config.block_size = 128
            if cache_config.enable_prefix_caching and cache_config.block_size != 128:
                logger.warning(
                    "If prefix caching is enabled, block size must be set to 128."
                )
                cache_config.block_size = 128

        # Activate custom ops for v1, except on 310P
        if not is_310p():
            compilation_config.custom_ops = ["all"]

        # If ascend_scheduler_config is enabled,
        # extends original scheduler_config to use AscendScheduler.
        if ascend_config.ascend_scheduler_config.enabled:
            from vllm_ascend.core.schedule_config import AscendSchedulerConfig
            ascend_scheduler_config = AscendSchedulerConfig.initialize_from_config(
                vllm_config.scheduler_config,
                ascend_config.ascend_scheduler_config)
            vllm_config.scheduler_config = ascend_scheduler_config

        if compilation_config.pass_config.enable_sequence_parallelism:
            if not parallel_config.enable_expert_parallel or vllm_config.model_config.hf_config.model_type != "qwen3_moe":
                raise NotImplementedError(
                    "For better performance in Qwen3 MoE, SP only works exclusively with MC2, AllToAll, and AllToAllV."
                )

    @classmethod
    def get_attn_backend_cls(cls,
                             selected_backend,
                             head_size,
                             dtype,
                             kv_cache_dtype,
                             block_size,
                             use_v1,
                             use_mla,
                             has_sink=False):
        """Return the dotted path of the attention backend class."""
        if not use_v1:
            raise ValueError("vLLM Ascend does not support V0 engine.")

        use_torchair = get_ascend_config().torchair_graph_config.enabled
        # choose attention backend based on use_mla and use_torchair
        backend_map = {
            (True, True):
            "vllm_ascend.torchair.torchair_mla.AscendMLATorchairBackend",
            (True, False):
            "vllm_ascend.attention.mla_v1.AscendMLABackend",
            (False, True):
            "vllm_ascend.torchair.torchair_attention.AscendAttentionTorchairBackend",
            (False, False):
            "vllm_ascend.attention.attention_v1.AscendAttentionBackend"
        }
        return backend_map[(use_mla, use_torchair)]

    @classmethod
    def get_punica_wrapper(cls) -> str:
        return "vllm_ascend.lora.punica_wrapper.punica_npu.PunicaWrapperNPU"

    @classmethod
    def get_current_memory_usage(cls,
                                 device: Optional[torch.types.Device] = None
                                 ) -> float:
        # Resets the peak counter, then reports the (post-reset) peak.
        torch.npu.reset_peak_memory_stats(device)
        return torch.npu.max_memory_allocated(device)

    @classmethod
    def get_device_communicator_cls(cls) -> str:
        return "vllm_ascend.distributed.communicator.NPUCommunicator"

    @classmethod
    def is_pin_memory_available(cls):
        return True

    @classmethod
    def supports_v1(cls, model_config: ModelConfig) -> bool:
        """Returns whether the current platform can support v1 for the supplied
        model configuration.
        """
        return True

    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        """
        Get piecewise backend class for piecewise graph.
        """
        return "vllm_ascend.compilation.acl_graph.ACLGraphWrapper"  # noqa

    @classmethod
    def stateless_init_device_torch_dist_pg(
        cls,
        backend: str,
        prefix_store: PrefixStore,
        group_rank: int,
        group_size: int,
        timeout: timedelta,
    ) -> ProcessGroup:
        """Build a ProcessGroup backed by HCCL without global process-group state."""
        from torch.distributed import is_hccl_available
        from torch_npu._C._distributed_c10d import ProcessGroupHCCL

        assert is_hccl_available()

        pg: ProcessGroup = ProcessGroup(
            prefix_store,
            group_rank,
            group_size,
        )

        backend_options = ProcessGroupHCCL.Options()
        backend_options._timeout = timeout

        backend_class = ProcessGroupHCCL(prefix_store, group_rank, group_size,
                                         backend_options)
        device = torch.device("npu")
        # TODO(Yizhou): Like we mentioned above, _set_default_backend is not
        # implemented in the 2.5.1 version of PyTorch. But we need to set it
        # after the latest version is released.
        # pg._set_default_backend(backend_type)
        backend_class._set_sequence_number_for_group()
        backend_type = ProcessGroup.BackendType.CUSTOM

        pg._register_backend(device, backend_type, backend_class)
        return pg
diff --git a/vllm_ascend/quantization/__init__.py b/vllm_ascend/quantization/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm_ascend/quantization/func_wrapper.py b/vllm_ascend/quantization/func_wrapper.py
new file mode 100644
index 0000000..8357695
--- /dev/null
+++ b/vllm_ascend/quantization/func_wrapper.py
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Optional, Tuple, Union

import torch
import torch_npu
from vllm.logger import logger
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, QuantizationConfig)


# func refers to VocabParallelEmbedding.__init__
def wrapper_vocab_parallel_embedding_init(func):
    # Decorator: after the wrapped __init__ runs, record `params_dtype` on the
    # instance so Ascend quantization code can read it later.

    def init(
        self,
        num_embeddings: int,
        embedding_dim: int,
        params_dtype: Optional[torch.dtype] = None,
        org_num_embeddings: Optional[int] = None,
        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        # NOTE(review): arguments are forwarded positionally, so this assumes
        # the wrapped __init__ keeps exactly this parameter order -- confirm
        # against the installed vLLM version.
        func(
            self,
            num_embeddings,
            embedding_dim,
            params_dtype,
            org_num_embeddings,
            padding_size,
            quant_config,
            prefix,
        )
        # TODO: Contact vLLM maintainers to add a `params_dtype` attribute to the `VocabParallelEmbedding` class.
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

    return init


# func refers to RMSNorm.__init__
def wrapper_rmsnorm_init(func):
    # Decorator: extend RMSNorm with an anti-outlier bias parameter and an
    # `ignore_anti` flag (flipped off later by wrapper_load_model when the
    # preceding projection is quantized).

    def init(self, hidden_size: int, **extra_args) -> None:
        func(self, hidden_size, **extra_args)
        self.ignore_anti = True
        # Zero-initialized, frozen bias; a no-op until quantization params are loaded.
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size),
                                       requires_grad=False)

    return init


# func refers to RMSNorm.forward_oot
def wrapper_rmsnorm_forward_oot(func):
    # Decorator: when the layer participates in anti-outlier quantization
    # (ignore_anti == False), use the fused NPU quantized RMSNorm; otherwise
    # fall back to the original forward and add the (zero or loaded) bias.

    def _rmsnorm_forward_oot(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if not self.ignore_anti:
            if residual is not None:
                # Residual-add happens in place before the fused quantized norm.
                residual += x
                out = torch_npu._npu_quant_rms_norm(
                    residual,
                    self.weight,
                    self.bias,
                    self.input_scale,
                    self.input_offset,
                    self.variance_epsilon,
                )
                return out, residual
            out = torch_npu._npu_quant_rms_norm(
                x,
                self.weight,
                self.bias,
                self.input_scale,
                self.input_offset,
                self.variance_epsilon,
            )
            return out

        if residual is not None:
            x, residual = func(self, x, residual)
            return x.add_(self.bias), residual

        return func(self, x).add_(self.bias)

    return _rmsnorm_forward_oot


# Per-model wiring: which projection's quant params feed which norm layer.
MODEL_LAYER_MAPPING = {
    "LlamaModel": {
        "attn": {
            "layer_attr": "self_attn",
            "proj_attr": "qkv_proj",
            "norm_attr": "input_layernorm",
            "unquantized_type": UnquantizedLinearMethod,
        },
        "mlp": {
            "layer_attr": "mlp",
            "proj_attr": "gate_up_proj",
            "norm_attr": "post_attention_layernorm",
            "unquantized_type": UnquantizedLinearMethod,
        },
    },
}


def wrapper_load_model(func):
    # Decorator: after the wrapped load_model runs, walk the model's layers
    # and wire each norm's anti-outlier state (ignore_anti, input_scale,
    # input_offset) from the following projection's quantization parameters.

    def postprocess_loading(self) -> None:
        func(self)

        def process_layer(layer, idx, mapping):

            def process_module(module_cfg, layer_obj):
                if module_cfg is None:
                    return

                module_obj = getattr(layer_obj, module_cfg["layer_attr"], None)
                if module_obj is None:
                    return

                proj_attr = module_cfg["proj_attr"]
                if callable(proj_attr):
                    # Allows per-layer projection lookup via a callable.
                    proj = proj_attr(module_obj, idx)
                else:
                    proj = getattr(module_obj, proj_attr, None)

                norm = getattr(layer_obj, module_cfg["norm_attr"], None)

                if proj is None or norm is None:
                    return

                # The norm skips the quantized path iff its projection is unquantized.
                norm.ignore_anti = isinstance(proj.quant_method,
                                              module_cfg["unquantized_type"])
                if not norm.ignore_anti:
                    # Clone the projection's activation-quant params onto the norm.
                    for param_name in ["input_scale", "input_offset"]:
                        if hasattr(proj, param_name):
                            param = getattr(proj, param_name)
                            norm.register_parameter(
                                param_name,
                                torch.nn.Parameter(param.clone(),
                                                   requires_grad=False))

            process_module(mapping.get("attn"), layer)
            process_module(mapping.get("mlp"), layer)

        model_type = self.model.model.__class__.__name__
        mapping = MODEL_LAYER_MAPPING.get(model_type)

        if not mapping:
            logger.info(
                f"Warning: Model type '{model_type}' not found in MODEL_LAYER_MAPPING. Skipping layer mapping."
            )
            return

        for idx, layer in enumerate(self.model.model.layers):
            process_layer(layer, idx, mapping)

        # The final norm has no following projection, so it never quantizes.
        if isinstance(self.model.model.norm, RMSNorm):
            self.model.model.norm.ignore_anti = True

    return postprocess_loading
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
new file mode 100644
index 0000000..d449c8d
--- /dev/null
+++ b/vllm_ascend/quantization/quant_config.py
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from types import MappingProxyType
from typing import Any, Callable, Dict, List, Mapping, Optional

import torch
from vllm.distributed import get_tensor_model_parallel_rank
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                  FusedMoeWeightScaleSupported)
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               RowParallelLinear,
                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization import \
    register_quantization_config
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod, VocabParallelEmbedding)
from vllm.model_executor.parameter import PerTensorScaleParameter
from vllm.model_executor.utils import set_weight_attrs

from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD

from .quantizer import AscendQuantizer


@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
class AscendQuantConfig(QuantizationConfig):
    """Config class for Ascend.

    A general config class that parses quantization configs supported on
    Ascend hardware and dispatches each layer to the matching quant method.
    """

    def __init__(self, quant_config: Dict[str, Any]):
        # Raw "quant_model_description.json" contents: maps parameter names
        # (e.g. "model.layers.0.mlp.gate_proj.weight") to quant type strings.
        self.quant_description = quant_config

    def __repr__(self) -> str:
        return "AscendQuantConfig:\n" + super().__repr__()

    @classmethod
    def get_name(cls) -> str:
        return ASCEND_QUANTIZATION_METHOD

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.int8, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        # Fix: "dose" -> "does" in the user-facing message.
        raise NotImplementedError(
            "Ascend hardware does not support \"get_min_capability\" feature."
        )

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return ["quant_model_description.json"]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "AscendQuantConfig":
        return cls(config)

    @classmethod
    def override_quantization_method(cls, hf_quant_cfg,
                                     user_quant) -> Optional[str]:
        # On NPU devices always route quantization through this config.
        if torch.npu.is_available():
            return ASCEND_QUANTIZATION_METHOD
        return None

    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
        """Select the quant method for one layer, or None if unhandled."""
        from vllm.attention.layer import Attention
        if isinstance(layer, LinearBase):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return UnquantizedLinearMethod()
            return AscendLinearMethod(self, prefix,
                                      self.packed_modules_mapping)
        elif isinstance(layer, Attention) and (
                # Merged the two previously-duplicated Attention branches:
                # FA quantization or KV-cache int8 both use the same method.
                self.quant_description.get('fa_quant_type') is not None
                or self.quant_description.get('kv_quant_type') == 'C8'):
            return AscendKVCacheMethod(self, prefix)
        elif isinstance(layer, FusedMoE):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return AscendUnquantizedFusedMoEMethod(layer.moe_config)
            return AscendFusedMoEMethod(self, prefix,
                                        self.packed_modules_mapping)
        elif isinstance(layer, VocabParallelEmbedding):
            if self.is_layer_skipped_ascend(prefix,
                                            self.packed_modules_mapping):
                return UnquantizedEmbeddingMethod()
            return AscendEmbeddingMethod(self, prefix,
                                         self.packed_modules_mapping)
        return None

    def is_layer_skipped_ascend(
        self,
        prefix: str,
        fused_mapping: Mapping[str, List[str]] = MappingProxyType({})):
        """Return True if the layer at `prefix` is unquantized ("FLOAT").

        For fused layers (e.g. qkv_proj) every constituent shard must agree.
        """
        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
        proj_name = prefix.split(".")[-1]
        if proj_name in fused_mapping:
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name)
                for shard_proj_name in fused_mapping[proj_name]
            ]

            is_skipped = None
            for shard_prefix in shard_prefixes:
                is_shard_skipped = self.quant_description[shard_prefix +
                                                          '.weight'] == "FLOAT"

                if is_skipped is None:
                    is_skipped = is_shard_skipped
                elif is_shard_skipped != is_skipped:
                    # Fix: grammatical error in the original message.
                    raise ValueError(
                        f"Detected some but not all shards of {prefix} "
                        "are quantized. All shards of fused layers "
                        "must have the same precision.")
        else:
            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"

        assert is_skipped is not None
        return is_skipped

    def get_scaled_act_names(self) -> List[str]:
        return []


class AscendLinearMethod(LinearMethodBase):
    """Linear method for Ascend quantization.

    This class calls AscendQuantizer to search a specific quantization
    implementation supported on Ascend hardware for linear methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_linear_method()

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: List[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        """Register weight + per-tensor/channel/group params on `layer`."""
        output_size_per_partition = sum(output_partition_sizes)
        weight_loader = extra_weight_attrs.get("weight_loader")

        weight_dict = self.quant_method.get_weight(input_size_per_partition,
                                                   output_size_per_partition,
                                                   params_dtype)
        for weight_name, weight_param in weight_dict.items():
            param = torch.nn.Parameter(weight_param, requires_grad=False)
            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
            layer.register_parameter(weight_name, param)
            set_weight_attrs(param, extra_weight_attrs)

        pertensor_dict = self.quant_method.get_pertensor_param(params_dtype)
        for pertensor_name, pertensor_param in pertensor_dict.items():
            param = PerTensorScaleParameter(data=pertensor_param,
                                            weight_loader=weight_loader)
            # disable warning
            param.ignore_warning = True
            layer.register_parameter(pertensor_name, param)

        perchannel_dict = self.quant_method.get_perchannel_param(
            output_size_per_partition, params_dtype)
        for perchannel_name, perchannel_param in perchannel_dict.items():
            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
            set_weight_attrs(param, {"output_dim": 0})
            layer.register_parameter(perchannel_name, param)
            set_weight_attrs(param, extra_weight_attrs)

        pergroup_dict = self.quant_method.get_pergroup_param(
            input_size_per_partition, output_size_per_partition, params_dtype)
        for pergroup_name, pergroup_param in pergroup_dict.items():
            param = torch.nn.Parameter(pergroup_param, requires_grad=False)
            set_weight_attrs(param, {"output_dim": 0})
            layer.register_parameter(pergroup_name, param)
            set_weight_attrs(param, extra_weight_attrs)
            if "weight_scale_second" in pergroup_name or \
                    "weight_offset_second" in pergroup_name:
                # Fix: the original assigned input_dim twice
                # (setattr(...) immediately followed by attribute assignment).
                param.input_dim = 1

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Row-parallel layers additionally receive the TP rank so the quant
        # method can apply bias only once across ranks.
        if isinstance(layer, RowParallelLinear):
            tp_rank = get_tensor_model_parallel_rank()
            return self.quant_method.apply(layer, x, bias, tp_rank)
        return self.quant_method.apply(layer, x, bias)


class AscendKVCacheMethod(BaseKVCacheMethod):
    """KVCache method for Ascend quantization.

    This class calls AscendQuantizer to search a specific quantization
    implementation supported on Ascend hardware for kvcache methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix)
        self.quant_method = self.quantizer.build_attention_method()

    def create_weights(self, layer: torch.nn.Module) -> None:
        # Different from linear method, there are no weight processing/slicing
        # steps for attention in vllm. So the whole process of create weights
        # is hidden into the specific quant method.
        self.quant_method.create_weights(layer)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)

    def apply(self, layer: torch.nn.Module, query: torch.Tensor,
              key: torch.Tensor, value: torch.Tensor, kv_cache, attn_metadata,
              attn_type, scale, output) -> torch.Tensor:
        return self.quant_method.apply(layer, query, key, value, kv_cache,
                                       attn_metadata, attn_type, scale,
                                       output)


class AscendFusedMoEMethod(FusedMoEMethodBase):
    """FusedMoE method for Ascend quantization.

    This class calls AscendQuantizer to search a specific quantization
    implementation supported on Ascend hardware for fused-MoE methods.
    (Fixed copy-pasted docstring that said "kvcache methods".)

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]):
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_moe_method()

    def create_weights(
        self,
        layer: torch.nn.Module,
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ) -> None:
        weight_param = self.quant_method.get_weight(
            num_experts, intermediate_size_per_partition, hidden_size,
            params_dtype)
        for param_key, param_value in weight_param.items():
            param = torch.nn.Parameter(param_value, requires_grad=False)
            layer.register_parameter(param_key, param)
            set_weight_attrs(param, extra_weight_attrs)

        # Default to CHANNEL-wise scales; per-group params are re-tagged
        # GROUP below.
        extra_weight_attrs.update(
            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value})
        per_group_param = [
            "weight_scale_second", "weight_offset_second", "scale_bias"
        ]
        dynamic_quant_param = self.quant_method.get_dynamic_quant_param(
            num_experts, intermediate_size_per_partition, hidden_size,
            params_dtype)
        for param_key, param_value in dynamic_quant_param.items():
            param = torch.nn.Parameter(param_value, requires_grad=False)
            layer.register_parameter(param_key, param)
            set_weight_attrs(param, extra_weight_attrs)
            if any(fields in param_key for fields in per_group_param):
                setattr(param, "quant_method",
                        FusedMoeWeightScaleSupported.GROUP.value)

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = True,
        enable_force_load_balance: bool = False,
        log2phy: torch.Tensor = None,
        global_redundant_expert_num=0,
        **kwargs,
    ) -> torch.Tensor:
        return self.quant_method.apply(
            layer, x, router_logits, top_k, renormalize, use_grouped_topk,
            global_num_experts, expert_map, topk_group, num_expert_group,
            custom_routing_function, scoring_func, e_score_correction_bias,
            is_prefill, enable_force_load_balance, log2phy,
            global_redundant_expert_num, **kwargs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):
            self.quant_method.process_weights_after_loading(layer)


class AscendEmbeddingMethod(AscendLinearMethod):
    """Embedding method for Ascend quantization.

    This class calls AscendQuantizer to search a specific quantization
    implementation supported on Ascend hardware for embedding methods.

    Args:
        quant_config: The Ascend quantization config.
    """

    def __init__(self, quant_config: AscendQuantConfig, prefix: str,
                 packed_modules_mapping: Dict[str, Any]) -> None:
        self.quantizer = AscendQuantizer.get_quantizer(
            quant_config.quant_description, prefix, packed_modules_mapping)
        self.quant_method = self.quantizer.build_linear_method()


# ===========================================================================
# diff --git a/vllm_ascend/quantization/quantizer.py
#          b/vllm_ascend/quantization/quantizer.py  (new file, 0e15ed2)
# ===========================================================================
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import importlib
import sys
import types
from typing import Any, Dict, List, Optional

from vllm.logger import logger

from .func_wrapper import (wrapper_rmsnorm_forward_oot, wrapper_rmsnorm_init,
                           wrapper_vocab_parallel_embedding_init)
from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                           AscendW4A8DynamicLinearMethod)
from .w8a8 import (AscendC8KVCacheMethod, AscendW8A8FusedMoEMethod,
                   AscendW8A8LinearMethod)
from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
                           AscendW8A8DynamicLinearMethod)

CUSTOMIZED_QUANTIZER_TYPE: List[str] = []


class AscendQuantizer:
    """An interface to different quantization implementations for Ascend hardware."""

    @classmethod
    def get_quantizer(cls,
                      quant_config: Dict[str, Any],
                      prefix: str,
                      packed_modules_mapping: Optional[Dict[str,
                                                            Any]] = None):
        # Fix: the original used a mutable ``dict()`` default argument;
        # ``None`` is forwarded and normalized by the callee.
        # TODO: Need a param to choose quantization algorithms.
        quantization_algorithm = ''

        if quantization_algorithm in CUSTOMIZED_QUANTIZER_TYPE:
            return

        return VLLMAscendQuantizer.get_quantizer(quant_config, prefix,
                                                 packed_modules_mapping)

    def build_linear_method(self):
        raise NotImplementedError

    def build_moe_method(self):
        raise NotImplementedError

    def build_attention_method(self):
        raise NotImplementedError


class VLLMAscendQuantizer:
    # One singleton instance per concrete quantizer subclass (stored on the
    # subclass via cls._instance in get_quantizer).
    _instance: Optional[object] = None
    # Class-level flag: the monkey patches below must run at most once per
    # process, no matter how many quantizer subclasses are instantiated.
    patched = False

    def __init__(self, quant_description):
        if VLLMAscendQuantizer.patched:
            return
        # Presence of any "*norm.bias" entry means anti-outlier weights are
        # shipped, so patch RMSNorm / embedding to handle them.
        for name in quant_description.keys():
            if "norm.bias" in name:
                VLLMAscendQuantizer.apply_patch(
                    "vllm.model_executor.layers.layernorm.RMSNorm", "__init__",
                    [wrapper_rmsnorm_init])
                VLLMAscendQuantizer.apply_patch(
                    "vllm_ascend.ops.layernorm.AscendRMSNorm", "forward_oot",
                    [wrapper_rmsnorm_forward_oot])
                VLLMAscendQuantizer.apply_patch(
                    "vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding",
                    "__init__", [wrapper_vocab_parallel_embedding_init])
                break
        VLLMAscendQuantizer.patched = True
        logger.info("Using the vLLM Ascend Quantizer version now!")

    @staticmethod
    def apply_patch(target_module, target_function, wrappers):
        """Wrap `target_function` of `target_module` with `wrappers` and
        re-point every alias of the original function found in sys.modules."""

        original_module, original_function = VLLMAscendQuantizer.parse_path(
            target_module, target_function, False)

        original_function_id = id(original_function)

        candidate = original_function
        for wrapper in wrappers:
            candidate = wrapper(candidate)
        if target_function is not None:
            setattr(original_module, target_function, candidate)

        # Modules that did `from x import f` hold their own reference;
        # rebind those too (matched by identity of the original function).
        for _, value in sys.modules.copy().items():
            if target_function is None:
                continue
            try:
                attr = getattr(value, target_function, None)
                if attr is not None and id(attr) == original_function_id:
                    setattr(value, target_function, candidate)
            except ImportError:
                continue

    @staticmethod
    def parse_path(module_path, function_name, create_dummy):
        """
        Parse module path and resolve/create modules as needed.

        Args:
            module_path: Dot-separated module path
            function_name: Target function name (None for module only)
            create_dummy: Create dummy modules/functions when missing

        Returns:
            Tuple of (resolved module, target function/none)

        Raises:
            ModuleNotFoundError: If module path is invalid and create_dummy=False
            AttributeError: If function is missing and create_dummy=False
        """
        from importlib.machinery import ModuleSpec

        def create_dummy_module(full_path, parent=None):
            """Create and register a placeholder module"""
            dummy = types.ModuleType(full_path)
            dummy.__file__ = "vllm_ascend.dummy_module.py"
            dummy.__spec__ = ModuleSpec(full_path, None)
            sys.modules[full_path] = dummy
            if parent:
                setattr(parent, full_path.split(".")[-1], dummy)
            return dummy

        def create_placeholder_function(func_name):
            """Create dummy function that raises when called"""

            def placeholder(*args, **kwargs):
                raise NotImplementedError(
                    f"Function {func_name} is a placeholder")

            placeholder.__name__ = func_name
            return placeholder

        modules = module_path.split(".")
        current_module = None
        # (removed an unused ``processed_path`` accumulator from the original)

        for idx, part in enumerate(modules):
            current_path = ".".join(modules[:idx + 1])
            parent_path = ".".join(modules[:idx]) if idx > 0 else None

            try:
                current_module = importlib.import_module(current_path)
            except ModuleNotFoundError:
                # Handle missing module
                parent = importlib.import_module(
                    parent_path) if parent_path else None
                if parent and hasattr(parent, part):
                    # Use existing attribute from parent (e.g. a class on a
                    # module path); may resolve the function early.
                    current_module = getattr(parent, part)
                    if function_name and hasattr(current_module,
                                                 function_name):
                        return current_module, getattr(current_module,
                                                       function_name)
                    if function_name and create_dummy:
                        ph_func = create_placeholder_function(function_name)
                        setattr(current_module, function_name, ph_func)
                        return current_module, ph_func
                    if function_name:
                        raise AttributeError(
                            f"Function {function_name} missing in {current_path}"
                        )
                else:
                    if not create_dummy:
                        raise
                    # Create and register dummy module
                    current_module = create_dummy_module(
                        current_path,
                        parent=importlib.import_module(parent_path)
                        if parent_path else None)

        # Final function handling
        final_module = sys.modules[module_path]
        if function_name is not None:
            if not hasattr(final_module, function_name):
                if create_dummy:
                    ph_func = create_placeholder_function(function_name)
                    setattr(final_module, function_name, ph_func)
                else:
                    setattr(final_module, function_name, None)
            return final_module, getattr(final_module, function_name)

        return final_module, None

    @staticmethod
    def build_linear_method():
        raise NotImplementedError(
            "Linear method is not implemented for the current quant type.")

    @staticmethod
    def build_moe_method():
        raise NotImplementedError(
            "MoE method is not implemented for the current quant type.")

    @staticmethod
    def build_attention_method():
        raise NotImplementedError(
            "Attention method is not implemented for the current quant type.")

    @staticmethod
    def get_linear_quant_type(quant_description: Dict[str, Any], prefix: str,
                              packed_modules_mapping: Dict[str, Any]):
        """Return the quant type of a linear layer; fused shards must agree."""
        proj_name = prefix.split(".")[-1]
        if proj_name in packed_modules_mapping:
            quant_type = None
            shard_prefixes = [
                prefix.replace(proj_name, shard_proj_name)
                for shard_proj_name in packed_modules_mapping[proj_name]
            ]
            for shard_prefix in shard_prefixes:
                shard_quant_type = quant_description[shard_prefix + '.weight']

                if quant_type is None:
                    quant_type = shard_quant_type
                elif shard_quant_type != quant_type:
                    raise ValueError(
                        f"Not all shards of {prefix} are quantized with same quant type."
                        f"Shard {proj_name} uses {shard_quant_type}, but another shard"
                        f"use {quant_type}. Please check quantization config.")
        else:
            quant_type = quant_description[prefix + '.weight']
        return quant_type

    @classmethod
    def get_quantizer(cls,
                      quant_description: Dict[str, Any],
                      prefix: str,
                      packed_modules_mapping: Optional[Dict[str, Any]] = None):
        if packed_modules_mapping is None:
            packed_modules_mapping = dict()
        # Attention
        if '.attn' in prefix and 'fa_quant_type' in quant_description.keys():
            quant_type = quant_description['fa_quant_type']
        # Use KVCache int8
        elif '.attn' in prefix and 'kv_quant_type' in quant_description.keys():
            quant_type = quant_description['kv_quant_type']
        # Linear
        else:
            quant_type = cls.get_linear_quant_type(quant_description, prefix,
                                                   packed_modules_mapping)
        if quant_type in SUPPORT_ASCEND_QUANTIZER_TYPE:
            # Fix: don't rebind ``cls`` (it shadowed the classmethod arg);
            # lazily create one singleton per concrete quantizer class.
            quantizer_cls = SUPPORT_ASCEND_QUANTIZER_TYPE[quant_type]
            if not quantizer_cls._instance:
                quantizer_cls._instance = quantizer_cls(quant_description)
            return quantizer_cls._instance
        # Fix: added the missing space before the type list in the message.
        raise NotImplementedError(
            "Currently, vLLM Ascend only supports following quant types: "
            f"{list(SUPPORT_ASCEND_QUANTIZER_TYPE.keys())}")


class W4A8DYNAMICQuantizer(VLLMAscendQuantizer):

    @staticmethod
    def build_linear_method():
        return AscendW4A8DynamicLinearMethod()

    @staticmethod
    def build_moe_method():
        return AscendW4A8DynamicFusedMoEMethod()


class W8A8Quantizer(VLLMAscendQuantizer):

    @staticmethod
    def build_linear_method():
        return AscendW8A8LinearMethod()

    @staticmethod
    def build_moe_method():
        return AscendW8A8FusedMoEMethod()

    @staticmethod
    def build_attention_method():
        return AscendC8KVCacheMethod()


class W8A8DYNAMICQuantizer(VLLMAscendQuantizer):

    @staticmethod
    def build_linear_method():
        return AscendW8A8DynamicLinearMethod()

    @staticmethod
    def build_moe_method():
        return AscendW8A8DynamicFusedMoEMethod()


SUPPORT_ASCEND_QUANTIZER_TYPE = {
    "W4A8_DYNAMIC": W4A8DYNAMICQuantizer,
    "W8A8": W8A8Quantizer,
    "W8A8_DYNAMIC": W8A8DYNAMICQuantizer,
    "C8": W8A8Quantizer,
}


# ===========================================================================
# diff --git a/vllm_ascend/quantization/w4a8_dynamic.py
#          b/vllm_ascend/quantization/w4a8_dynamic.py  (new file, 72f956d)
# ===========================================================================
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# + +from typing import Any, Callable, Dict, Optional + +import numpy as np +import torch +import torch_npu +from vllm.config import get_current_vllm_config +from vllm.distributed import get_ep_group +from vllm.forward_context import get_forward_context + +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.ops.fused_moe import unified_fused_experts_eager +from vllm_ascend.ops.layers.experts_selector import select_experts + + +class AscendW4A8DynamicLinearMethod: + """Linear method for Ascend W4A8_DYNAMIC + """ + + def __init__(self): + self.transpose_weight = True + try: + self.group_size = get_current_vllm_config( + ).quant_config.quant_description.get("group_size", 256) + except AttributeError: + self.group_size = 256 + + @staticmethod + def get_weight(input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = { + "weight": torch.empty(output_size, input_size, dtype=torch.int8) + } + return params_dict + + @staticmethod + def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + @staticmethod + def get_perchannel_param(output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = {} + params_dict["weight_scale"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_offset"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_scale_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + params_dict["weight_offset_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + return params_dict + + @staticmethod + def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, + per_group_scale: torch.Tensor): + k, n = weight.shape + 
group_num, n = per_group_scale.shape + weight_high = weight.to(torch.float32).reshape( + group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) + weight_high = weight_high.reshape(k, n) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) + antiquant_scale = (scale * per_group_scale).reshape(group_num, n) + return antiquant_scale.npu(), bias + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = None, + ) -> torch.Tensor: + return torch_npu.npu_weight_quant_batchmatmul( + x, + layer.weight, + antiquant_scale=layer.weight_scale_second.to(x.dtype), + antiquant_group_size=self.group_size, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten().to( + torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight_scale_second.data, scale_bias = self.process_scale_second( + layer.weight.data, + layer.weight_scale.data, + layer.weight_scale_second.data.transpose(0, 1).contiguous(), + ) + param = torch.nn.Parameter(scale_bias, requires_grad=False) + layer.register_parameter("weight_scale_bias", param) + layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32)) + + +class AscendW4A8DynamicFusedMoEMethod: + """FusedMoe method for Ascend W4A8_DYNAMIC. 
+ """ + + def __init__(self): + self.transpose_weight = True + + self.ep_group = get_ep_group() + + vllm_config = get_current_vllm_config() + self.group_size = vllm_config.quant_config.quant_description.get( + "group_size", 256) + quant_version = vllm_config.quant_config.quant_description.get( + "version", "0") + # NOTE: new quantize weights: 2 int4 pack into int8 + self.new_quant_version = quant_version == "1.0.0" + self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size + if self.new_quant_version and self.tp_size > 16: + raise ValueError( + "The current weight does not support moe part tp>16.") + + try: + device_group = get_mc2_group().device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name( + local_rank) + except AttributeError: + self.moe_all_to_all_group_name = "" + + def get_weight(self, num_experts: int, + intermediate_size_per_partition: int, hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + if self.new_quant_version: + w13_output_size = intermediate_size_per_partition + w2_output_size = hidden_sizes // 2 + else: + w13_output_size = 2 * intermediate_size_per_partition + w2_output_size = hidden_sizes + + param_dict["w13_weight"] = torch.empty(num_experts, + w13_output_size, + hidden_sizes, + dtype=torch.int8) + param_dict["w2_weight"] = torch.empty(num_experts, + w2_output_size, + intermediate_size_per_partition, + dtype=torch.int8) + return param_dict + + def get_dynamic_quant_param(self, num_experts: int, + intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + 
param_dict["w13_weight_offset"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + param_dict["w13_weight_scale_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // self.group_size, + dtype=params_dtype) + + param_dict["w13_weight_offset_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // self.group_size, + dtype=params_dtype) + + param_dict["w2_weight_scale"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_offset"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_scale_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // self.group_size, + dtype=params_dtype) + param_dict["w2_weight_offset_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // self.group_size, + dtype=params_dtype) + + if self.new_quant_version: + param_dict["w13_scale_bias"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32) + param_dict["w2_scale_bias"] = torch.empty(num_experts, + hidden_sizes, + 16 // self.tp_size, + dtype=torch.float32) + + return param_dict + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = True, + enable_force_load_balance: bool = True, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + quantized_x_for_share: Optional[Any] = None, + 
dynamic_scale_for_share: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + assert router_logits.shape[ + 1] == global_num_experts, "Number of global experts mismatch" + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + topk_weights, topk_ids, row_idx = select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + global_num_experts=global_num_experts) + + fused_moe_state = get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. 
        # Profile-run load balancing: randomize expert ids so no rank
        # accumulates a disproportionate number of tokens.
        if enable_force_load_balance:
            topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)

        topk_weights = topk_weights.to(x.dtype)

        return unified_fused_experts_eager(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            w1_scale=layer.w13_weight_scale_second,
            w2_scale=layer.w2_weight_scale_second,
            w1_scale_bias=layer.w13_scale_bias,
            w2_scale_bias=layer.w2_scale_bias,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            row_idx=row_idx,
            expert_map=expert_map,
            log2phy=log2phy,
            global_redundant_expert_num=global_redundant_expert_num,
            shared_experts=shared_experts,
            shared_gate_up=shared_gate_up,
            shared_dequant_scale=shared_dequant_scale,
            mc2_mask=kwargs.get("mc2_mask", None),
            with_quant=True)

    def process_scale(self, weight: torch.Tensor, scale, per_group_scale):
        """Fold per-channel and per-group scales into the uint64-packed scale
        layout expected by the NPU grouped matmul, and (for the old quant
        format) derive an additive bias term.

        Returns a tuple ``(packed_scale_tensor_on_npu, bias_or_None)``.
        """
        group_num, k, n = weight.shape
        # the weight of the new version is reduced by half by pack n, so it needs to be restored
        if self.new_quant_version:
            n = n * 2
            per_group_scale = per_group_scale.reshape(group_num, -1, n)
        group_num, quantgroup_num, n = per_group_scale.shape
        bias = None
        if not self.new_quant_version:
            # Dequantize to float to compute the correction bias:
            # bias = 8 * sum_k(weight_high * scale); the factor 8 offsets the
            # int4 zero-point shift used by the old format.
            weight_high = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \
                per_group_scale.reshape([group_num, quantgroup_num, 1, n])
            weight_high = weight_high.reshape([group_num, k, n])
            bias = 8 * (weight_high.to(torch.float32) * scale).sum(axis=1)
        # Combined scale, rounded through fp16 before widening back to fp32.
        scale_fp32 = (scale * per_group_scale).to(torch.float16).to(
            torch.float32)
        scale_fp32_np = scale_fp32.cpu().numpy()
        # Reinterpret the fp32 bits as uint32 (no value conversion).
        scale_fp32_np.dtype = np.uint32
        # Interleave each uint32 scale with a zero word so every scale
        # occupies the low half of a 64-bit slot.
        sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2),
                                 dtype=np.uint32)

        sscale_uint64[..., ::2] = scale_fp32_np

        sscale_uint64_buffer = np.frombuffer(sscale_uint64.tobytes(),
                                             dtype=np.int64).copy()
        sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape(
            group_num, quantgroup_num, n)
        sscale_uint64_tensor = sscale_uint64_tensor.npu()
        return sscale_uint64_tensor, bias

    def update_bias(self, layer, w13_bias, w2_bias):
        """Attach/normalize the scale-bias parameters on ``layer``.

        New format: the bias tensors already exist on the layer and are
        reduced in place. Old format: register the biases computed by
        ``process_scale`` as new parameters.
        """
        if self.new_quant_version:
            layer.w13_scale_bias.data = layer.w13_scale_bias.data.transpose(
                1, 2).contiguous().sum(axis=1)
            layer.w2_scale_bias.data = layer.w2_scale_bias.data.transpose(
                1, 2).contiguous().sum(axis=1)
        else:
            w13_scale_bias = torch.nn.Parameter(w13_bias, requires_grad=False)
            layer.register_parameter("w13_scale_bias", w13_scale_bias)
            w2_scale_bias = torch.nn.Parameter(w2_bias, requires_grad=False)
            layer.register_parameter("w2_scale_bias", w2_scale_bias)

    def pack_to_int32(self, weight: torch.Tensor):
        """Pack int4 weights into int32 storage.

        New format: raw byte reinterpretation (4 bytes -> 1 int32).
        Old format: delegate to ``torch_npu.npu_quantize`` with quint4x2.
        """
        if self.new_quant_version:
            group_num, k, n = weight.shape
            assert n % 4 == 0, "the last dim of weight needs to be divided by 4"
            packed_n = n // 4
            # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4
            packed_weight = torch.from_numpy(
                np.frombuffer(weight.cpu().numpy().tobytes(), dtype=np.int32))
            return packed_weight.reshape(group_num, k, packed_n).npu()
        else:
            return torch_npu.npu_quantize(weight.to(torch.float32),
                                          torch.tensor([1.]).npu(), None,
                                          torch.quint4x2, -1, False)

    def process_weights_after_loading(self, layer):
        """Post-load weight preparation: transpose, fold scales, attach
        biases and pack int4 weights into int32."""
        if self.transpose_weight:
            layer.w13_weight.data = layer.w13_weight.data.transpose(
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
            layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose(
                1, 2).contiguous()
            layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose(
                1, 2).contiguous()
            layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose(
                1, 2).contiguous()
            layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose(
                1, 2).contiguous()

        layer.w13_weight_scale_second.data, w13_bias = self.process_scale(
            layer.w13_weight, layer.w13_weight_scale.data,
            layer.w13_weight_scale_second.data)
        layer.w2_weight_scale_second.data, w2_bias = self.process_scale(
            layer.w2_weight,
            layer.w2_weight_scale.data,
            layer.w2_weight_scale_second.data)

        self.update_bias(layer, w13_bias, w2_bias)

        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
new file mode 100644
index 0000000..e4cbdc8
--- /dev/null
+++ b/vllm_ascend/quantization/w8a8.py
@@ -0,0 +1,647 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any, Callable, Dict, Optional

import torch
import torch_npu
from vllm.attention.backends.abstract import AttentionType
from vllm.distributed.parallel_state import get_ep_group

from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p


# Thin wrapper around torch_npu.npu_quantize for per-tensor int8 quantization.
def quant_per_tensor(in_tensor: torch.Tensor,
                     input_scale: torch.Tensor,
                     input_offset: torch.Tensor,
                     function=False):
    return torch_npu.npu_quantize(in_tensor, input_scale, input_offset,
                                  torch.qint8, -1, function)


class AscendW8A8LinearMethod:
    """Linear method for Ascend W8A8.

    Args:
        w_sym: whether the linear weight is symmetrically quantized.
    """

    def __init__(self) -> None:
        # aclnn quant matmul requires to transpose matrix B, set to true by default.
        self.transpose_weight = not is_310p()

    @staticmethod
    def get_weight(
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype = torch.bfloat16,
    ) -> Dict[str, Any]:
        """Allocate the int8 weight placeholder for loading."""
        params_dict = {
            "weight": torch.empty(output_size, input_size, dtype=torch.int8)
        }
        return params_dict

    @staticmethod
    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
        """Per-tensor activation quantization parameters (scale + offset)."""
        params_dict = {}
        params_dict["input_scale"] = torch.empty(1, dtype=params_dtype)
        params_dict["input_offset"] = torch.empty(1, dtype=torch.int8)
        return params_dict

    @staticmethod
    def get_perchannel_param(
        output_size: int,
        params_dtype: torch.dtype,
    ) -> Dict[str, Any]:
        """Per-output-channel parameters: dequant scale, quant bias, weight
        scale/offset. deq_scale dtype depends on the activation dtype."""
        params_dict = {}
        params_dict["quant_bias"] = torch.empty(output_size, dtype=torch.int32)
        if params_dtype == torch.bfloat16:
            params_dict["deq_scale"] = torch.empty(output_size,
                                                   dtype=torch.float32)
        elif params_dtype == torch.float16:
            params_dict["deq_scale"] = torch.empty(output_size,
                                                   dtype=torch.int64)
        params_dict["weight_scale"] = torch.empty(output_size,
                                                  1,
                                                  dtype=params_dtype)
        params_dict["weight_offset"] = torch.empty(output_size,
                                                   1,
                                                   dtype=params_dtype)
        return params_dict

    def get_pergroup_param(self, input_size: int, output_size: int,
                           params_dtype: torch.dtype) -> Dict[str, Any]:
        # W8A8 has no per-group parameters.
        return {}

    @staticmethod
    def apply(
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
        tp_rank: Optional[int] = 0,
    ) -> torch.Tensor:
        """Quantize activations (if needed) and run the int8 matmul.

        quant_bias is applied on tp_rank 0 only so it is not added once per
        tensor-parallel shard.
        """
        if x.dtype != torch.int8:
            x = quant_per_tensor(
                x,
                layer.aclnn_input_scale_reciprocal,
                layer.aclnn_input_offset,
            )
        quant_bias = layer.quant_bias if tp_rank == 0 else None
        if is_310p():
            # On 300I Duo platform, we need transpose again if
            # using nz. This transpose can be skipped in torchair.
            output = torch_npu.npu_quant_matmul(
                x,
                layer.weight.data.transpose(1, 0),
                layer.deq_scale,
                bias=quant_bias,
                output_dtype=layer.params_dtype,
            )
        else:
            output = torch_npu.npu_quant_matmul(
                x,
                layer.weight,
                layer.deq_scale,
                bias=quant_bias,
                output_dtype=layer.params_dtype,
            )
        return output

    def process_weights_after_loading(self, layer):
        """Expand scalar activation scale/offset to channel width, optionally
        transpose the weight, and cast it to FRACTAL_NZ layout."""
        expanding_factor = layer.weight.data.shape[1]
        layer.aclnn_input_scale = torch.nn.Parameter(
            layer.input_scale.data.repeat(expanding_factor),
            requires_grad=False)
        # Reciprocal form is what quant_per_tensor consumes in apply().
        layer.aclnn_input_scale_reciprocal = 1 / torch.nn.Parameter(
            layer.input_scale.data.repeat(expanding_factor),
            requires_grad=False)
        layer.aclnn_input_offset = torch.nn.Parameter(
            layer.input_offset.data.repeat(expanding_factor),
            requires_grad=False).to(layer.aclnn_input_scale.dtype)
        if self.transpose_weight:
            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
        layer.weight.data = torch_npu.npu_format_cast(layer.weight.data,
                                                      ACL_FORMAT_FRACTAL_NZ)
        layer.weight_scale.data = torch.flatten(layer.weight_scale.data)
        layer.weight_offset.data = torch.flatten(layer.weight_offset.data)


class AscendW8A8FusedMoEMethod:
    """FusedMoe method for Ascend W8A8.
+ """ + + def __init__(self): + self.transpose_weight = True + + @staticmethod + def get_weight(num_experts: int, intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight"] = torch.empty(num_experts, + 2 * + intermediate_size_per_partition, + hidden_sizes, + dtype=torch.int8, + requires_grad=False) + param_dict["w2_weight"] = torch.empty(num_experts, + hidden_sizes, + intermediate_size_per_partition, + dtype=torch.int8, + requires_grad=False) + return param_dict + + @staticmethod + def get_dynamic_quant_param(num_experts: int, + intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32) + param_dict["w13_weight_offset"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float16) + param_dict["w2_weight_scale"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=torch.float32) + param_dict["w2_weight_offset"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=torch.float16) + param_dict["w2_deq_scale"] = torch.empty(num_experts, + hidden_sizes, + dtype=torch.float32) + param_dict["w13_deq_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32) + param_dict["w2_input_scale"] = torch.empty(num_experts, + 1, + dtype=torch.float32) + param_dict["w13_input_scale"] = torch.empty(num_experts, + 1, + dtype=torch.float32) + param_dict["w2_input_offset"] = torch.empty(num_experts, + 1, + dtype=torch.int8) + param_dict["w13_input_offset"] = torch.empty(num_experts, + 1, + dtype=torch.int8) + param_dict["quant_bias"] = torch.empty(num_experts, + hidden_sizes, + dtype=torch.int32) + + return param_dict + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + 
top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = True, + enable_force_load_balance: bool = False, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + assert router_logits.shape[ + 1] == global_num_experts, "Number of global experts mismatch" + + topk_weights, topk_ids = select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + global_num_experts=global_num_experts) + + if is_310p(): + return fused_experts_310p(hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w1_input_scale=layer.w13_input_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + w2_input_scale=layer.w2_input_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=global_num_experts, + expert_map=expert_map) + return fused_experts(hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w1_input_scale=layer.w13_input_scale, + w1_input_offset=layer.w13_input_offset, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + w2_input_scale=layer.w2_input_scale, + w2_input_offset=layer.w2_input_offset, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + global_num_experts=global_num_experts, + expert_map=expert_map) + + def process_weights_after_loading(self, layer): + if not is_310p(): + 
layer.w13_weight.data = layer.w13_weight.data.transpose( + 1, 2).contiguous() + layer.w2_weight.data = layer.w2_weight.data.transpose( + 1, 2).contiguous() + layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( + layer.w13_weight_scale.data.shape[0], -1) + + layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( + layer.w13_weight_offset.data.shape[0], -1) + layer.w2_weight_scale.data = layer.w2_weight_scale.data.view( + layer.w2_weight_scale.data.shape[0], -1) + layer.w2_weight_offset.data = layer.w2_weight_offset.data.view( + layer.w2_weight_offset.data.shape[0], -1) + expanding_factor_w13 = layer.w13_weight.data.shape[1] + expanding_factor_w2 = layer.w2_weight.data.shape[1] + + if is_310p(): + layer.w13_input_scale.data = torch.nn.Parameter( + layer.w13_input_scale.data.max()) + layer.w2_input_scale.data = torch.nn.Parameter( + layer.w2_input_scale.data.max()) + else: + layer.w13_input_scale.data = torch.nn.Parameter( + layer.w13_input_scale.data.repeat(1, + expanding_factor_w13)[0:1]) + layer.w2_input_scale.data = torch.nn.Parameter( + layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1]) + + layer.w13_input_offset.data = torch.nn.Parameter( + layer.w13_input_scale.data.repeat(1, expanding_factor_w13)[0:1]) + layer.w2_input_offset.data = torch.nn.Parameter( + layer.w2_input_scale.data.repeat(1, expanding_factor_w2)[0:1]) + + # converting ACL_FORMAT_FRACTAL_NZ. + # npu_quant_grouped_matmul_dequant in eager mode does not accept + # ACL_FORMAT_FRACTAL_NZ. 
+ if not is_310p(): + layer.w13_weight.data = torch_npu.npu_format_cast( + layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous() + layer.w2_weight.data = torch_npu.npu_format_cast( + layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous() + + +class AscendC8KVCacheMethod: + + def __init__(self) -> None: + self.antiquant_scale_comb = None + + @staticmethod + def create_weights(layer) -> None: + param_dict = {} # num_kv_heads * head_size + param_dict["key_antiquant_scale"] = torch.empty(layer.num_kv_heads * + layer.head_size, + dtype=torch.float16, + requires_grad=False) + param_dict["value_antiquant_scale"] = torch.empty(layer.num_kv_heads * + layer.head_size, + dtype=torch.float16, + requires_grad=False) + for weight_name, weight_param in param_dict.items(): + param = torch.nn.Parameter(weight_param, requires_grad=False) + layer.register_parameter(weight_name, param) + + def process_weights_after_loading(self, layer): + self.antiquant_scale_comb = torch.cat( + (layer.key_antiquant_scale.data.unsqueeze(0), + layer.value_antiquant_scale.data.unsqueeze(0)), + dim=0).to(torch.float16).contiguous() + + def apply(self, layer, query, key, value, kv_cache, attn_metadata, + attn_type, scale, output) -> torch.Tensor: + num_tokens = query.shape[0] + if attn_metadata is None: + return output.view(num_tokens, layer.num_heads * layer.head_size) + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "PallasAttentionBackendImpl") + + # C8 + quant_key = quant_per_tensor( + key.view(-1, layer.num_kv_heads * layer.head_size), + layer.key_antiquant_scale.data.view(-1), None, True) + quant_value = quant_per_tensor( + value.view(-1, layer.num_kv_heads * layer.head_size), + layer.value_antiquant_scale.data.view(-1), None, True) + + # View q k v to BSH. 
+ query = query.view(-1, layer.num_heads, layer.head_size) + key = key.view(-1, layer.num_kv_heads, layer.head_size) + value = value.view(-1, layer.num_kv_heads, layer.head_size) + # TODO: Remove this contiguous in the future. + value = value.contiguous() + + if kv_cache[0].numel() > 0: + # if key_cache is None: + key_cache, value_cache = kv_cache[0], kv_cache[1] + slots = attn_metadata.slot_mapping + + block_size = key_cache.shape[1] + slots_indices = slots.reshape(-1, 1) + block_indices = slots_indices // block_size + slots_indices = slots_indices % block_size + indices = torch.cat((block_indices, slots_indices), dim=1) + + # C8 + torch_npu.npu_scatter_nd_update_(key_cache, indices, quant_key) + torch_npu.npu_scatter_nd_update_(value_cache, indices, quant_value) + + # V0-Style scheduler situation. + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: + assert attn_metadata is not None + assert attn_metadata.attn_mask is not None + mask = attn_metadata.attn_mask + torch_npu._npu_flash_attention(query=query, + key=key, + value=value, + mask=mask, + seq_len=attn_metadata.seq_lens, + scale_value=scale, + num_heads=layer.num_heads, + num_kv_heads=layer.num_kv_heads, + out=output.reshape(query.shape)) + + elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: + raise NotImplementedError("kv cache int8 are not " + "implemented for " + "PrefillCacheHit") + elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: # changed attn_metadata.attn_state == AscendAttentionState.DecodeOnly + if hasattr(attn_metadata, "decode"): + # torch_air + decode_meta = attn_metadata.decode + seq_lens = decode_meta.seq_lens_list + else: + seq_lens = attn_metadata.seq_lens + block_size = key_cache.shape[1] + query = query.view(num_tokens, 1, layer.num_heads * + layer.head_size).contiguous() # changed + + # [num_blocks, block_size, N, D] --> [num_blocks, N, block_size, D] + key = key_cache + value = value_cache + + output = 
torch_npu.npu_incre_flash_attention( + query, + key, + value, + num_key_value_heads=layer.num_kv_heads, + num_heads=layer.num_heads, + actual_seq_lengths=seq_lens, + scale_value=scale, + input_layout='BSH', + block_size=block_size, + block_table=attn_metadata.block_tables, + antiquant_scale=self.antiquant_scale_comb, + ) + + # Normal V1 situation. + else: + raise NotImplementedError("kv cache int8 are not " + "implemented for " + "other case") + return output + + +def fused_experts_310p( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w1_input_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + w2_input_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + global_num_experts: int, + expert_map: torch.Tensor = None, +) -> torch.Tensor: + ep_size = get_ep_group().world_size + local_num_experts = global_num_experts // ep_size + local_num_group = top_k // ep_size + + bsz, _ = hidden_states.shape + flatten_topk_ids = topk_ids.view(-1) + sorted_topk_ids = torch.argsort(flatten_topk_ids.float()) + sorted_topk_ids = sorted_topk_ids.to(torch.int32) + sorted_hidden_states = hidden_states.index_select( + 0, sorted_topk_ids // local_num_group) + + experts_id = torch.arange(0, + local_num_experts, + dtype=topk_ids.dtype, + device=topk_ids.device) + num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to( + torch.float32).sum(0) + topk_scales = topk_weights.view(-1).index_select( + 0, sorted_topk_ids).unsqueeze(-1) + group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64) + + gate_up_out = torch_npu.npu_quant_grouped_matmul_dequant( + x=sorted_hidden_states, + quantized_weight=w1, + weight_scale=w1_scale, + group_list=group_list, + x_scale=w1_input_scale, + quant_mode="pertensor") + + gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to( + torch.float16) + gate_up_out *= topk_scales + + down_out = torch_npu.npu_quant_grouped_matmul_dequant( + 
        x=gate_up_out,
        quantized_weight=w2,
        weight_scale=w2_scale,
        group_list=group_list,
        x_scale=w2_input_scale,
        quant_mode="pertensor")

    # Undo the expert sort and sum each token's top-k expert outputs.
    unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32)
    unsorted_hidden_states = down_out.index_select(0, unsorted_topk_ids)
    final_hidden_states = unsorted_hidden_states.reshape(
        bsz, top_k // ep_size, -1).sum(1)

    return final_hidden_states


def fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w1_scale: torch.Tensor,
    w1_input_scale: torch.Tensor,
    w1_input_offset: torch.Tensor,
    w2: torch.Tensor,
    w2_scale: torch.Tensor,
    w2_input_scale: torch.Tensor,
    w2_input_offset: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    global_num_experts: int,
    expert_map: torch.Tensor = None,
) -> torch.Tensor:
    """
    Fused experts with top-k routing.

    Args:
        hidden_states: Hidden states of shape (num_tokens, hidden_size).
        w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size).
        w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size).
        topk_weights: Routing weights of shape (num_tokens, top_k).
        topk_ids: Selected expert IDs of shape (num_tokens, top_k).
        top_k: Number of experts to select.
        expert_map: Expert mapping of shape (num_experts,).

    Returns:
        hidden_states: Hidden states after routing.
    """
    # The bare string literal below keeps the original shape/contiguity
    # asserts disabled (it is a no-op expression at runtime).
    """
    # Check constraints.
    assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch"
    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
    assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
    assert w1.is_contiguous(), "Expert weights1 must be contiguous"
    assert w2.is_contiguous(), "Expert weights2 must be contiguous"
    """

    original_dtype = hidden_states.dtype
    ep_size = get_ep_group().world_size
    local_num_experts = global_num_experts // ep_size
    w1_input_scale, _ = w1_input_scale.max(0)
    quant_sorted_hidden_states = quant_per_tensor(
        hidden_states,
        w1_input_scale,
        None,
        True,
    )
    if expert_map is not None:
        expanded_x, expanded_row_idx, expert_token_count, expanded_scale = torch_npu.npu_moe_init_routing_v2(
            quant_sorted_hidden_states,
            topk_ids,
            scale=None,
            active_num=topk_ids.numel(),
            expert_capacity=-1,
            expert_num=local_num_experts,
            drop_pad_mode=0,
            expert_tokens_num_type=1,
            expert_tokens_num_flag=True,
            quant_mode=-1,
            active_expert_range=[0, local_num_experts],
            row_idx_type=0,
        )

    else:
        raise NotImplementedError(
            "The quantified version of MOE class models "
            "currently does not support tensor parallelism")
    if expanded_x.dtype != w1.dtype:
        # NOTE(review): w1_input_scale was already reduced with .max(0) above;
        # reducing it again here collapses another dimension — confirm intended.
        w1_input_scale, _ = w1_input_scale.max(0)
        quant_sorted_hidden_states = quant_per_tensor(
            expanded_x,
            w1_input_scale,
            None,
            True,
        )
    else:
        quant_sorted_hidden_states = expanded_x
    # gmm1: gate_up_proj over expert-grouped tokens.
    gate_up_out = torch_npu.npu_grouped_matmul(
        x=[quant_sorted_hidden_states],
        weight=[w1],
        scale=[w1_scale * w1_input_scale[0]],
        split_item=2,
        group_list_type=1,
        group_type=0,
        group_list=expert_token_count,
        output_dtype=original_dtype,
    )[0]
    gate_up_out = torch_npu.npu_swiglu(gate_up_out)

    if gate_up_out.dtype != w2.dtype:
        w2_input_scale, _ = w2_input_scale.max(0)
        quant_gate_up_out = quant_per_tensor(
            gate_up_out,
            w2_input_scale,
            None,
            True,
        )
    else:
        quant_gate_up_out = gate_up_out

    # gmm2: down_proj.
    down_out = torch_npu.npu_grouped_matmul(
        x=[quant_gate_up_out],
        weight=[w2],
        scale=[w2_scale * w2_input_scale[0]],
        split_item=2,
        group_list_type=1,
        group_type=0,
        group_list=expert_token_count,
        output_dtype=original_dtype,
    )[0]

    if expert_map is not None:
        # Restore original token order and apply routing weights.
        final_hidden_states = torch_npu.npu_moe_finalize_routing(
            down_out,
            skip1=None,
            skip2=None,
            bias=None,
            scales=topk_weights.to(down_out.dtype),
            expanded_src_to_dst_row=expanded_row_idx,
            export_for_source_row=topk_ids,
            drop_pad_mode=2,
        )
    else:
        raise NotImplementedError(
            "The quantified version of MOE class models "
            "currently does not support tensor parallelism")

    return final_hidden_states
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
new file mode 100644
index 0000000..20c68be
--- /dev/null
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -0,0 +1,453 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any, Callable, Dict, Optional, Tuple, Union

import torch
import torch_npu
from vllm.config import CompilationLevel, get_current_vllm_config
from vllm.distributed import get_ep_group
from vllm.forward_context import get_forward_context

import vllm_ascend.envs as envs_ascend
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import FusedMoEState
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.common_fused_moe import \
    fused_experts as unified_fused_experts
from vllm_ascend.ops.fused_moe import unified_fused_experts_eager
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, dispose_tensor


def apply_mlp_decode(hidden_states: torch.Tensor,
                     w1: torch.Tensor,
                     w1_scale: torch.Tensor,
                     w2: torch.Tensor,
                     w2_scale: torch.Tensor,
                     group_list: torch.Tensor,
                     dynamic_scale: torch.Tensor = None,
                     group_list_type: int = 1) -> torch.Tensor:
    """
    apply MLP: gate_up_proj -> swiglu -> down_proj
    (decode variant: fused dequant+swiglu+quant between the two matmuls)
    Args:
        hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size).
        w1: expert weights1 with shape
            (num_experts, hidden_size, intermediate_size * 2)
        w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
        w2: expert weights2 with shape
            (num_experts, intermediate_size, hidden_size)
        w2_scale: weights2 scale with shape (num_experts, hidden_size)
        group_list: number of tokens for each expert, follow cumsum mode, and
            with shape (num_experts).
        transpose_weight:
            w1: (num_experts, intermediate_size * 2, hidden_size) ->
                (num_experts, hidden_size, intermediate_size * 2)
            w2: (num_experts, hidden_size, intermediate_size) ->
                (num_experts, intermediate_size, hidden_size)
    Returns:
        hidden_states: output hidden states after MLP.
    """

    if dynamic_scale is None:
        unquantized_hidden_states = hidden_states
        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
            hidden_states)
        # Dispose the original unquantized hidden states
        # to save npu memory because they're no longer used.
        dispose_tensor(unquantized_hidden_states)
    else:
        pertoken_scale = dynamic_scale

    # gmm1: gate_up_proj
    hidden_states = torch_npu.npu_grouped_matmul(
        x=[hidden_states],
        weight=[w1],
        split_item=3,
        group_list_type=group_list_type,
        group_type=0,
        group_list=group_list,
        output_dtype=torch.int32)[0]

    # act_fn: swiglu
    hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
        x=hidden_states,
        weight_scale=w1_scale,
        activation_scale=pertoken_scale,
        bias=None,
        quant_scale=None,
        quant_offset=None,
        group_index=group_list,
        activate_left=True,
        quant_mode=1,
    )

    # gmm2: down_proj
    hidden_states = torch_npu.npu_grouped_matmul(
        x=[hidden_states],
        weight=[w2],
        scale=[w2_scale],
        per_token_scale=[swiglu_out_scale],
        split_item=2,
        group_list_type=group_list_type,
        group_type=0,
        group_list=group_list,
        output_dtype=w2_scale.dtype)[0]
    return hidden_states


def apply_mlp(hidden_states: torch.Tensor,
              w1: torch.Tensor,
              w1_scale: torch.Tensor,
              w2: torch.Tensor,
              w2_scale: torch.Tensor,
              group_list: torch.Tensor,
              dynamic_scale: torch.Tensor = None,
              group_list_type: int = 1,
              w1_scale_bias: torch.Tensor = None,
              w2_scale_bias: torch.Tensor = None) -> torch.Tensor:
    """
    apply MLP: gate_up_proj -> swiglu -> down_proj

    Args:
        hidden_states: input hidden states with shape (num_tokens, hidden_size).
        w1: expert weights1 with shape
            (num_experts, hidden_size, intermediate_size * 2)
        w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2)
        w2: expert weights2 with shape
            (num_experts, intermediate_size, hidden_size)
        w2_scale: weights2 scale with shape (num_experts, hidden_size)
        group_list: number of tokens for each expert, follow cumsum mode, and
            with shape (num_experts).
        transpose_weight:
            w1: (num_experts, intermediate_size * 2, hidden_size) ->
                (num_experts, hidden_size, intermediate_size * 2)
            w2: (num_experts, hidden_size, intermediate_size) ->
                (num_experts, intermediate_size, hidden_size)

    Returns:
        hidden_states: output hidden states after MLP.
    """

    if dynamic_scale is None:
        unquantized_hidden_states = hidden_states
        hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(
            hidden_states)
        # Dispose the original unquantized hidden states
        # to save npu memory because they're no longer used.
        dispose_tensor(unquantized_hidden_states)
    else:
        pertoken_scale = dynamic_scale

    bias1, bias2 = None, None
    _output_dtype = w2_scale.dtype

    if w1_scale_bias is not None:
        # Scale-bias path (w4a8): convert a cumsum group_list to per-group
        # counts so group_list_type=1 semantics apply.
        if group_list_type == 0:
            group_list = torch.cat(
                [group_list[:1], torch.diff(group_list, dim=0)])
            group_list_type = 1
        bias1 = [w1_scale_bias]
        bias2 = [w2_scale_bias]
        # TODO w4a8 scene: dynamic acquisition of dtype in the future
        _output_dtype = torch.bfloat16

    # gmm1: gate_up_proj
    hidden_states = torch_npu.npu_grouped_matmul(
        x=[hidden_states],
        weight=[w1],
        scale=[w1_scale],
        bias=bias1,
        per_token_scale=[pertoken_scale],
        split_item=2,
        group_list_type=group_list_type,
        group_type=0,
        group_list=group_list,
        output_dtype=_output_dtype)[0]

    # act_fn: swiglu
    hidden_states = torch_npu.npu_swiglu(hidden_states)
    hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant(
        hidden_states)

    # gmm2: down_proj
    hidden_states = torch_npu.npu_grouped_matmul(
        x=[hidden_states],
        weight=[w2],
        scale=[w2_scale],
        bias=bias2,
        per_token_scale=[swiglu_out_scale],
        split_item=2,
        group_list_type=group_list_type,
        group_type=0,
        group_list=group_list,
        output_dtype=_output_dtype)[0]

    return hidden_states


class AscendW8A8DynamicLinearMethod:
    """Linear method for Ascend W8A8_DYNAMIC.
    """

    def __init__(self):
        # quant matmul requires transposed weight; done after loading.
        self.transpose_weight = True

    @staticmethod
    def get_weight(input_size: int, output_size: int,
                   params_dtype: torch.dtype) -> Dict[str, Any]:
        """Allocate the int8 weight placeholder for loading."""
        params_dict = {
            "weight": torch.empty(output_size, input_size, dtype=torch.int8)
        }
        return params_dict

    @staticmethod
    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
        # Dynamic quantization: no static per-tensor activation parameters.
        return {}

    @staticmethod
    def get_perchannel_param(
        output_size: int,
        params_dtype: torch.dtype,
    ) -> Dict[str, Any]:
        """Per-output-channel weight scale/offset placeholders."""
        params_dict = {}
        params_dict["weight_scale"] = torch.empty(output_size,
                                                  1,
                                                  dtype=params_dtype)
        params_dict["weight_offset"] = torch.empty(output_size,
                                                   1,
                                                   dtype=params_dtype)
        return params_dict

    def get_pergroup_param(self, input_size: int, output_size: int,
                           params_dtype: torch.dtype) -> Dict[str, Any]:
        # No per-group parameters for W8A8_DYNAMIC.
        return {}

    @staticmethod
    def apply(
        layer: torch.nn.Module,
        x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
        bias: Optional[torch.Tensor] = None,
        tp_rank: Optional[int] = 0,
    ) -> torch.Tensor:
        """Dynamic-quantize activations (unless a pre-quantized
        (tensor, scale) tuple is given) and run the int8 matmul.

        Behavior is tuned by layer._ascend_quant_config:
        output_dtype / pertoken_scale / return_scale.
        """
        config = getattr(layer, "_ascend_quant_config", {})
        if not isinstance(x, tuple):
            output_dtype = config.get("output_dtype", x.dtype)
            quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x)
        else:
            assert "output_dtype" in config.keys(), (
                f"DynamicLinearMethod needs explicitly specified `output_dtype`"
                f"for pre-quantized input, got config [{config}]")
            output_dtype = config["output_dtype"]
            quantized_x, dynamic_scale = x
        pertoken_scale = (dynamic_scale
                          if config.get("pertoken_scale", True) else None)

        output = torch_npu.npu_quant_matmul(
            quantized_x,
            layer.weight,
            layer.weight_scale,
            pertoken_scale=pertoken_scale,
            bias=bias,
            output_dtype=output_dtype,
        )
        return ((output, dynamic_scale)
                if config.get("return_scale", False) else output)

    def process_weights_after_loading(self, layer):
        """Transpose the weight, cast it to NZ format and flatten scales."""
        if self.transpose_weight:
            layer.weight.data = layer.weight.data.transpose(0, 1).contiguous()
        # cast quantized weight tensors in NZ format (29) for higher inference speed
        layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29)
        layer.weight_scale.data = layer.weight_scale.data.flatten()
        layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32)
        layer.weight_offset.data = layer.weight_offset.data.flatten()


class AscendW8A8DynamicFusedMoEMethod:
    """FusedMoe method for Ascend W8A8_DYNAMIC.
    """

    def __init__(self):
        self.transpose_weight = True

        self.ep_group = get_ep_group()

        vllm_config = get_current_vllm_config()
        ascend_config = get_ascend_config()
        # ACL-graph path only when piecewise compilation is on, eager mode is
        # off, and torchair graph mode is disabled.
        self.use_aclgraph = (
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
            and not vllm_config.model_config.enforce_eager
            and not ascend_config.torchair_graph_config.enabled)

        try:
            device_group = get_mc2_group().device_group
            # TODO: Try local_rank = ep_group.rank_in_group
            local_rank = torch.distributed.get_rank(group=device_group)
            backend = device_group._get_backend(torch.device("npu"))
            self.moe_all_to_all_group_name = backend.get_hccl_comm_name(
                local_rank)
        except AttributeError:
            # No MC2 group available (e.g. single-device run).
            self.moe_all_to_all_group_name = ""

    @staticmethod
    def get_weight(num_experts: int, intermediate_size_per_partition: int,
                   hidden_sizes: int,
                   params_dtype: torch.dtype) -> Dict[str, Any]:
        """Allocate int8 expert weight placeholders (w13 fused gate+up, w2 down)."""
        param_dict = {}
        param_dict["w13_weight"] = torch.empty(num_experts,
                                               2 *
                                               intermediate_size_per_partition,
                                               hidden_sizes,
                                               dtype=torch.int8)
        param_dict["w2_weight"] = torch.empty(num_experts,
                                              hidden_sizes,
                                              intermediate_size_per_partition,
                                              dtype=torch.int8)
        return param_dict

    @staticmethod
    def get_dynamic_quant_param(num_experts: int,
                                intermediate_size_per_partition: int,
                                hidden_sizes: int,
                                params_dtype: torch.dtype) -> Dict[str, Any]:
        """Allocate per-expert weight scale/offset placeholders."""
        param_dict = {}
        param_dict["w13_weight_scale"] = torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            1,
            dtype=params_dtype)
        param_dict["w13_weight_offset"] = torch.empty(
            num_experts,
            2 * intermediate_size_per_partition,
            1,
            dtype=params_dtype)
        param_dict["w2_weight_scale"] = torch.empty(num_experts,
                                                    hidden_sizes,
                                                    1,
                                                    dtype=params_dtype)
        param_dict["w2_weight_offset"] = torch.empty(num_experts,
                                                     hidden_sizes,
                                                     1,
                                                     dtype=params_dtype)
        return param_dict

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        renormalize: bool,
        use_grouped_topk: bool = False,
        global_num_experts: int = -1,
        expert_map: Optional[torch.Tensor] = None,
        topk_group: Optional[int] = None,
        num_expert_group: Optional[int] = None,
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
        is_prefill: bool = True,
        enable_force_load_balance: bool = True,
        log2phy: torch.Tensor = None,
        global_redundant_expert_num: int = 0,
        shared_experts: Optional[Any] = None,
        quantized_x_for_share: Optional[Any] = None,
        dynamic_scale_for_share: Optional[Any] = None,
        **kwargs,
    ) -> torch.Tensor:
        """Route tokens to experts and run the dynamically-quantized expert
        MLPs, using the unified fused-experts path under ACL graph mode."""
        assert router_logits.shape[
            1] == global_num_experts, "Number of global experts mismatch"

        topk_weights, topk_ids, row_idx = select_experts(
            hidden_states=x,
            router_logits=router_logits,
            top_k=top_k,
            use_grouped_topk=use_grouped_topk,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            e_score_correction_bias=e_score_correction_bias,
            global_num_experts=global_num_experts)

        if self.use_aclgraph:
            return unified_fused_experts(
+ hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + use_int8_w8a8=True, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + expert_map=expert_map, + ) + + fused_moe_state = get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. + if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) + + topk_weights = topk_weights.to(x.dtype) + + return unified_fused_experts_eager( + hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + row_idx=row_idx, + expert_map=expert_map, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + shared_experts=shared_experts, + shared_gate_up=shared_gate_up, + shared_dequant_scale=shared_dequant_scale, + mc2_mask=kwargs.get("mc2_mask", None), + with_quant=True) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.w13_weight.data = layer.w13_weight.data.transpose( + 1, 2).contiguous() + layer.w2_weight.data = layer.w2_weight.data.transpose( + 1, 2).contiguous() + if envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP: + torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ) + layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( + layer.w13_weight_scale.data.shape[0], -1) + layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to( + torch.float32) + 
layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( + layer.w13_weight_offset.data.shape[0], -1) + layer.w2_weight_scale.data = layer.w2_weight_scale.data.view( + layer.w2_weight_scale.data.shape[0], -1) + layer.w2_weight_offset.data = layer.w2_weight_offset.data.view( + layer.w2_weight_offset.data.shape[0], -1) diff --git a/vllm_ascend/sample/__init__.py b/vllm_ascend/sample/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py new file mode 100644 index 0000000..e0d770d --- /dev/null +++ b/vllm_ascend/sample/rejection_sampler.py @@ -0,0 +1,504 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch +import torch.nn as nn +import vllm.v1.sample.rejection_sampler as rs +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs, + generate_uniform_probs) +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata + +PLACEHOLDER_TOKEN_ID = -1 +GREEDY_TEMPERATURE = -1 +# Maximum number of speculative draft tokens allowed per request in a single +# step. This value is chosen to be large enough to handle typical use cases. +MAX_SPEC_LEN = 32 + + +class AscendRejectionSampler(RejectionSampler, nn.Module): + """ + The implementation strictly follows the algorithm described in + https://arxiv.org/abs/2211.17192. + However, we want to clarify the terminology used in the implementation: + accepted tokens: tokens that are accepted based on the relationship + between the "raw" draft and target probabilities. + recovered tokens: tokens that are sampled based on the adjusted probability + distribution, which is derived from both the draft and target + probabilities. + bonus tokens: + If all proposed tokens are accepted, the bonus token is added to the + end of the sequence. The bonus token is only sampled from the target + probabilities. 
We pass in the bonus tokens instead of sampling them + in the rejection sampler to allow for more flexibility in the + sampling process. For example, we can use top_p, top_k sampling for + bonus tokens, while spec decode does not support these sampling + strategies. + output tokens: + Tokens are finally generated with the rejection sampler. + output tokens = accepted tokens + recovered tokens + bonus tokens + """ + + def forward( + self, + metadata: SpecDecodeMetadata, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_logits: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + ''' + Args: + metadata: + Metadata for spec decoding. + draft_probs (Optional[torch.Tensor]): + Probability distribution for the draft tokens. Shape is + [num_tokens, vocab_size]. Can be None if probabilities are + not provided, which is the case for ngram spec decode. + target_logits (torch.Tensor): + Target model's logits probability distribution. + Shape is [num_tokens, vocab_size]. Here, probabilities from + different requests are flattened into a single tensor because + this is the shape of the output logits. + NOTE: `target_logits` can be updated in place to save memory. + bonus_token_ids_tensor (torch.Tensor): + A tensor containing bonus tokens. Shape is [batch_size, 1]. + Bonus tokens are added to the end of the sequence if all + proposed tokens are accepted. We generate the bonus tokens + outside of the rejection sampler with the default sampling + strategy. It allows for more flexibility in the sampling + process such as top_p, top_k sampling. + sampling_metadata (SamplingMetadata): + Additional metadata needed for sampling, such as temperature, + top-k/top-p parameters, or other relevant information. + Returns: + output_token_ids (torch.Tensor): + A tensor containing the final output token IDs. 
+ ''' + assert metadata.max_spec_len <= MAX_SPEC_LEN + # [num_tokens, vocab_size] + # NOTE(woosuk): `target_logits` can be updated in place inside the + # `compute_probs` function. + target_probs = compute_probs( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + + output_token_ids = rejection_sample( + metadata.draft_token_ids, + metadata.num_draft_tokens, + metadata.max_spec_len, + metadata.cu_num_draft_tokens, + draft_probs, + target_probs, + bonus_token_ids, + sampling_metadata, + ) + return output_token_ids + + +def rejection_sample( + # [num_tokens] + draft_token_ids: torch.Tensor, + # [batch_size] + num_draft_tokens: list[int], + max_spec_len: int, + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + assert draft_token_ids.ndim == 1 + assert draft_probs is None or draft_probs.ndim == 2 + assert cu_num_draft_tokens.ndim == 1 + assert target_probs.ndim == 2 + + batch_size = len(num_draft_tokens) + num_tokens = draft_token_ids.shape[0] + vocab_size = target_probs.shape[-1] + device = target_probs.device + assert draft_token_ids.is_contiguous() + assert draft_probs is None or draft_probs.is_contiguous() + assert target_probs.is_contiguous() + assert bonus_token_ids.is_contiguous() + assert target_probs.shape == (num_tokens, vocab_size) + + # Create output buffer. + output_token_ids = torch.empty( + (batch_size, max_spec_len + 1), + dtype=torch.int32, # Consistent with SamplerOutput.sampled_token_ids. + device=device, + ) + output_token_ids.fill_(PLACEHOLDER_TOKEN_ID) + + if sampling_metadata.all_greedy: + is_greedy = None + else: + is_greedy = sampling_metadata.temperature == GREEDY_TEMPERATURE + if not sampling_metadata.all_random: + # Rejection sampling for greedy sampling requests. 
+ target_argmax = target_probs.argmax(dim=-1) + if min(num_draft_tokens) == 1 and max( + num_draft_tokens) == 1 and sampling_metadata.all_greedy: + rejection_greedy_sample_spec_len_1_pytorch( + output_token_ids, + draft_token_ids, + target_argmax, + bonus_token_ids, + ) + else: + rejection_greedy_sample_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + target_argmax, + bonus_token_ids, + num_draft_tokens, + max_spec_len, + is_greedy, + ) + if sampling_metadata.all_greedy: + return output_token_ids + + # Generate uniform probabilities for rejection sampling. + # [num_tokens] + uniform_probs = generate_uniform_probs( + num_tokens, + num_draft_tokens, + sampling_metadata.generators, + device, + ) + + # Sample recovered tokens for each position. + # [num_tokens] + recovered_token_ids = sample_recovered_tokens( + max_spec_len, + num_draft_tokens, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + sampling_metadata, + device, + ) + + # Rejection sampling for random sampling requests. + rejection_random_sample_pytorch( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + bonus_token_ids, + recovered_token_ids, + uniform_probs, + is_greedy, + max_spec_len, + vocab_size, + IS_NGRAM=draft_probs is None, + # num_warps=1, + ) + return output_token_ids + + +def expand_batch_to_tokens( + x: torch.Tensor, # [batch_size] + cu_num_tokens: torch.Tensor, # [batch_size] + num_tokens: int, + replace_from: int = 0, + replace_to: int = 0, +) -> torch.Tensor: + """Expand [batch_size] tensor to [num_tokens] tensor based on the number of + tokens per batch in cu_num_tokens. + + For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then + num_tokens = 6, and expanded_x = [a, a, b, b, b, c]. + + Args: + x: [batch_size] tensor to expand. + cu_num_tokens: [batch_size] tensor containing the cumulative number of + tokens per batch. 
Each element represents the total number of + tokens up to and including that batch. + num_tokens: Total number of tokens. + replace_from: int = 0 + Value to be replaced if it is found in x. + replace_to: int = 0 + Value to replace with when replace_from is found. + Returns: + expanded_x: [num_tokens] tensor. + """ + batch_size = x.shape[0] + assert cu_num_tokens.shape[0] == batch_size + expanded_x = x.new_empty(num_tokens) + expand_pytorch( + expanded_x, + x, + cu_num_tokens, + replace_from, + replace_to, + MAX_NUM_TOKENS=MAX_SPEC_LEN, # To avoid recompilation. + ) + return expanded_x + + +def sample_recovered_tokens( + max_spec_len: int, + num_draft_tokens: list[int], + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens] + draft_token_ids: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + sampling_metadata: SamplingMetadata, + device: torch.device, +) -> torch.Tensor: + # NOTE(woosuk): Create only one distribution for each request. + batch_size = len(num_draft_tokens) + vocab_size = target_probs.shape[-1] + q = torch.empty( + (batch_size, vocab_size), + dtype=torch.float32, + device=device, + ) + q.exponential_() + for i, generator in sampling_metadata.generators.items(): + # Do not generate random numbers for requests with no draft tokens. + # This can be important for reproducibility. 
+ if num_draft_tokens[i] > 0: + q[i].exponential_(generator=generator) + + recovered_token_ids = torch.empty_like(draft_token_ids) + sample_recovered_tokens_pytorch( + recovered_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + q, + vocab_size, + IS_NGRAM=draft_probs is None, + ) + return recovered_token_ids + + +def rejection_greedy_sample_spec_len_1_pytorch( + output_token_ids, # [batch_size, 2] + draft_token_ids, # [num_tokens] + target_argmax, # [num_tokens] + bonus_token_ids, # [batch_size] +): + batch_size = output_token_ids.size(0) + num_tokens = draft_token_ids.size(0) + assert batch_size == num_tokens + accept_req_mask = draft_token_ids == target_argmax + output_token_ids[:, 0] = target_argmax + bonus_token_ids = bonus_token_ids.squeeze(1) + output_token_ids[accept_req_mask, 1] = bonus_token_ids[accept_req_mask] + + +def rejection_greedy_sample_pytorch( + output_token_ids, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens, # [batch_size] + draft_token_ids, # [num_tokens] + target_argmax, # [num_tokens] + bonus_token_ids, # [batch_size] + draft_tokens_per_req, # [batch_size], list + max_spec_len, + is_greedy=None, # [batch_size] or None +): + batch_size = output_token_ids.size(0) + num_tokens = draft_token_ids.size(0) + device = output_token_ids.device + draft_tokens_per_req = torch.tensor(draft_tokens_per_req).to( + device, non_blocking=True) + if is_greedy is None: + is_greedy = torch.ones(batch_size, dtype=torch.bool, device=device) + + start_indices = cu_num_draft_tokens - draft_tokens_per_req + req_ids = torch.arange(batch_size, device=device) + token_req_ids = torch.repeat_interleave(req_ids, draft_tokens_per_req) + token_positions = torch.arange( + num_tokens, device=device) - start_indices[token_req_ids] + + # Find the first mismatch position of each request. 
+ mismatch_global = (draft_token_ids != target_argmax) + if max_spec_len == 0: + first_mismatch_pos_per_req = torch.zeros(batch_size, + dtype=torch.long, + device=device) + else: + # [bs, max_spec_len] + pos_matrix = torch.full((batch_size, max_spec_len), + -1, + dtype=torch.long, + device=device) + pos_matrix[token_req_ids, token_positions] = token_positions + mismatch_matrix = torch.full((batch_size, max_spec_len), + False, + dtype=torch.bool, + device=device) + mismatch_matrix[token_req_ids, token_positions] = mismatch_global + mismatch_positions = torch.where(mismatch_matrix, pos_matrix, + max_spec_len * 2) + first_mismatch_pos_per_req, _ = torch.min(mismatch_positions, dim=1) + no_mismatch_mask = (first_mismatch_pos_per_req == max_spec_len * 2) + first_mismatch_pos_per_req[no_mismatch_mask] = draft_tokens_per_req[ + no_mismatch_mask] + + # Copy matched target tokens into output. + copy_len = torch.minimum(first_mismatch_pos_per_req + 1, + draft_tokens_per_req) + copy_indices = torch.arange(max_spec_len + 1, + device=device).expand(batch_size, -1) + copy_mask = copy_indices < copy_len.unsqueeze(1) + greedy_mask = is_greedy.unsqueeze(1) + final_copy_mask = copy_mask & greedy_mask + global_idx = start_indices.unsqueeze(1) + copy_indices + output_token_ids[final_copy_mask] = target_argmax[ + global_idx[final_copy_mask]].to(output_token_ids.dtype) + # Fill bonus token. 
+ needs_bonus = is_greedy & (first_mismatch_pos_per_req + >= draft_tokens_per_req) + if torch.any(needs_bonus): + bonus_rows = torch.where(needs_bonus)[0] + bonus_cols = draft_tokens_per_req[bonus_rows] + bonus_token_ids = bonus_token_ids.squeeze(1) + output_token_ids[bonus_rows, bonus_cols] = bonus_token_ids[bonus_rows] + + +def rejection_random_sample_pytorch( + output_token_ids, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens, # [batch_size] + draft_token_ids, # [num_tokens] + draft_probs, # [num_tokens, vocab_size] or None + target_probs, # [num_tokens, vocab_size] + bonus_token_ids, # [batch_size] + recovered_token_ids, # [num_tokens] + uniform_probs, # [num_tokens] + is_greedy, # [batch_size] + max_spec_len, + vocab_size, + IS_NGRAM=False, +): + batch_size = output_token_ids.shape[0] + + for req_idx in range(batch_size): + if is_greedy[req_idx]: + continue + + if req_idx == 0: + start_idx = 0 + else: + start_idx = cu_num_draft_tokens[req_idx - 1].item() + end_idx = cu_num_draft_tokens[req_idx].item() + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + draft_token_id = draft_token_ids[start_idx + pos].item() + + if IS_NGRAM: + draft_prob = 1.0 + else: + draft_prob = draft_probs[start_idx + pos, + draft_token_id].item() + + target_prob = target_probs[start_idx + pos, + draft_token_id].item() + uniform_prob = uniform_probs[start_idx + pos].item() + + if draft_prob > 0 and target_prob / draft_prob >= uniform_prob: + token_id = draft_token_id + else: + rejected = True + token_id = recovered_token_ids[start_idx + pos].item() + + output_token_ids[req_idx, pos] = token_id + + if not rejected: + bonus_token_id = bonus_token_ids[req_idx].item() + output_token_ids[req_idx, num_draft_tokens] = bonus_token_id + + +def expand_pytorch( + output_ptr, # [num_tokens] + input_ptr, # [batch_size] + cu_num_tokens_ptr, # [batch_size] + replace_from, + replace_to, + MAX_NUM_TOKENS, +): + batch_size = 
len(input_ptr) + + for req_idx in range(batch_size): + start_idx = 0 if req_idx == 0 else cu_num_tokens_ptr[req_idx - 1] + end_idx = cu_num_tokens_ptr[req_idx] + num_tokens = end_idx - start_idx + + src_val = input_ptr[req_idx] + src_val = replace_to if src_val == replace_from else src_val + + offset = torch.arange(MAX_NUM_TOKENS, device=num_tokens.device) + mask = offset < num_tokens + + output_slice = start_idx + offset[mask] + output_ptr[output_slice] = src_val + + +def sample_recovered_tokens_pytorch( + output_token_ids, # [num_tokens] + cu_num_draft_tokens, # [batch_size] + draft_token_ids, # [num_tokens] + draft_probs, # [num_tokens, vocab_size] or None + target_probs, # [num_tokens, vocab_size] + q, # [batch_size, vocab_size] + vocab_size, + IS_NGRAM=False, +): + batch_size = len(cu_num_draft_tokens) + + for req_idx in range(batch_size): + start_idx = 0 if req_idx == 0 else cu_num_draft_tokens[req_idx - 1] + end_idx = cu_num_draft_tokens[req_idx] + num_draft_tokens = end_idx - start_idx + + for pos in range(num_draft_tokens): + token_idx = start_idx + pos + + if IS_NGRAM: + draft_token_id = draft_token_ids[token_idx] + orig_prob = target_probs[token_idx, draft_token_id].item() + target_probs[token_idx, draft_token_id] = 0 + prob = target_probs[token_idx].clone() + else: + draft_p = draft_probs[token_idx].clone() + target_p = target_probs[token_idx].clone() + prob = torch.maximum(target_p - draft_p, + torch.tensor(0.0, device=target_p.device)) + + q_values = torch.full((vocab_size, ), + float('-inf'), + device=q.device) + q_values[:vocab_size] = q[req_idx, :vocab_size] + + recovered_id = torch.argmax(prob / q_values).item() + output_token_ids[token_idx] = recovered_id + + if IS_NGRAM: + target_probs[token_idx, draft_token_id] = orig_prob + + +rs.expand_batch_to_tokens = expand_batch_to_tokens diff --git a/vllm_ascend/sample/sampler.py b/vllm_ascend/sample/sampler.py new file mode 100644 index 0000000..b5a212a --- /dev/null +++ b/vllm_ascend/sample/sampler.py 
@@ -0,0 +1,86 @@ +import torch +import torch_npu +from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler, random_sample +from vllm.v1.sample.sampler import Sampler + +from vllm_ascend.utils import is_310p, vllm_version_is + +if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): + from vllm.config import LogprobsMode + DEFAULT_LOGPROBS_MODE = LogprobsMode.RAW_LOGPROBS +else: + LogprobsMode = None + DEFAULT_LOGPROBS_MODE = "raw_logprobs" + + +class AscendSampler(Sampler): + + def __init__(self, logprobs_mode=DEFAULT_LOGPROBS_MODE): + # TODO: support logprobs_mode in vllm-ascend + super().__init__(logprobs_mode=logprobs_mode) + self.topk_topp_sampler = AscendTopKTopPSampler() + + +class AscendTopKTopPSampler(TopKTopPSampler): + + def _apply_top_k_top_p( + self, + logits: torch.Tensor, + k: torch.Tensor, + p: torch.Tensor, + ) -> torch.Tensor: + # npu_top_k_top_p uses the operator aclnnApplyTopKTopP, but aclnnApplyTopKTopP currently does not support 310P + if not is_310p() and p is not None and k is not None: + # npu_top_k_top_p's parameter order is (logits, p, k), not (logits, k, p) + return torch_npu.npu_top_k_top_p(logits, p, k) + + if p is None and k is None: + return logits + + probs = logits.softmax(dim=-1) + probs_sort, _ = probs.sort(dim=-1, descending=False) + + if k is not None: + top_k_count = probs_sort.size(1) - k.to( + torch.long) # shape: (batch, ) + top_k_count = top_k_count.unsqueeze(dim=1) + top_k_cutoff = probs_sort.gather(-1, top_k_count) + + # Make sure the no top-k rows are no-op. 
+ no_top_k_mask = (k == logits.shape[1]).unsqueeze(dim=1) + top_k_cutoff.masked_fill_(no_top_k_mask, -float("inf")) + + elements_to_discard = probs < top_k_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + if p is not None: + cumprob = torch.cumsum(probs_sort, dim=-1) + top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) + top_p_mask[:, -1] = False # at least one + + top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) + top_p_cutoff = probs_sort.gather(-1, top_p_count) + elements_to_discard = probs < top_p_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + return logits + + def forward_native(self, logits, generators, k, p): + """Override pytorch native implementation to torch_npu""" + logits = self._apply_top_k_top_p(logits, k, p) + if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): + + logits_to_return = None + if self.logprobs_mode == LogprobsMode.PROCESSED_LOGITS: + logits_to_return = logits + elif self.logprobs_mode == LogprobsMode.PROCESSED_LOGPROBS: + logits_to_return = logits.log_softmax(dim=-1, + dtype=torch.float32) + + probs = logits.softmax(dim=-1, dtype=torch.float32) + output = None + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + output = random_sample(probs, generators) + else: + output = (random_sample(probs, generators), logits_to_return) + return output diff --git a/vllm_ascend/torchair/__init__.py b/vllm_ascend/torchair/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/torchair/models/__init__.py b/vllm_ascend/torchair/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/torchair/models/qwen2.py b/vllm_ascend/torchair/models/qwen2.py new file mode 100644 index 0000000..3537aa8 --- /dev/null +++ b/vllm_ascend/torchair/models/qwen2.py @@ -0,0 +1,364 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. 
+# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. + +from collections.abc import Iterable +from typing import Any, List, Optional, Union + +import torch +import torch.nn.functional as F +import vllm +import vllm.envs as envs +from torch import nn +from transformers import Qwen2Config +from vllm.attention import AttentionMetadata, AttentionType +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (get_pp_group, tensor_model_parallel_all_gather, + tensor_model_parallel_reduce_scatter) +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP +from vllm.model_executor.models.qwen2 import Qwen2Attention # noqa: F401 +from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM # noqa: F401 +from vllm.model_executor.models.qwen2 import Qwen2MLP, Qwen2Model +from vllm.model_executor.models.utils import (AutoWeightsLoader, + PPMissingLayer, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + 
+from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention_v1 import AscendAttentionState + + +def all_gather_and_maybe_unpad( + hidden_states: torch.Tensor, + pad_size: int, +) -> torch.Tensor: + hidden_states = tensor_model_parallel_all_gather(hidden_states, 0) + if pad_size > 0: + return hidden_states[:-pad_size, :] + return hidden_states + + +def maybe_pad_and_reduce_scatter( + hidden_states: torch.Tensor, + pad_size: int, +) -> torch.Tensor: + if pad_size > 0: + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_size)) + hidden_states = tensor_model_parallel_reduce_scatter(hidden_states, 0) + return hidden_states + + +class CustomQwen2Attention(Qwen2Attention): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + max_position: int = 4096 * 32, + rope_theta: float = 10000, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + rope_scaling: Optional[tuple] = None, + prefix: str = "", + attn_type: str = AttentionType.DECODER, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, + ) -> None: + super().__init__( + hidden_size=hidden_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position=max_position, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=rope_scaling, + prefix=prefix, + attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config) + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.torchair_graph_enabled and attn_metadata is not None and attn_metadata.attn_state == 
AscendAttentionState.DecodeOnly: + q, k = self.rotary_emb(positions, + q, + k, + is_prefill=False, + is_qwen_torchair=True) + forward_kwargs = {} + if envs.VLLM_USE_V1: + output_shape = q.shape + output = torch.empty(output_shape, + dtype=q.dtype, + device=q.device) + forward_kwargs['output'] = output + + attn_output = self.attn.impl.forward(self.attn, + q, + k, + v, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + trace_flag=False, + **forward_kwargs) + output, _ = self.o_proj(attn_output) + return output + else: + if type(self.rotary_emb) is RotaryEmbedding: + q, k = self.rotary_emb(positions, q, k, is_qwen_torchair=True) + else: + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class CustomQwen2DecoderLayer(nn.Module): + + def __init__( + self, + config: Qwen2Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + # Requires transformers > 4.32.0 + rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) + dual_chunk_attention_config = getattr(config, + "dual_chunk_attention_config", + None) + + # By default, Qwen2 uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. 
Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + + self.self_attn = CustomQwen2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + max_position=config.max_position_embeddings, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + cache_config=cache_config, + quant_config=quant_config, + rope_scaling=rope_scaling, + prefix=f"{prefix}.self_attn", + attn_type=attn_type, + dual_chunk_attention_config=dual_chunk_attention_config, + ) + self.mlp = Qwen2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attn(positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) +class CustomQwen2Model(Qwen2Model): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + decoder_layer_type: type[nn.Module] = CustomQwen2DecoderLayer): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + decoder_layer_type=decoder_layer_type) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: Optional[List[torch.Tensor]] = None, + attn_metadata: Optional[AttentionMetadata] = None, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + kv_cache = kv_caches[i - self.start_layer] \ + if kv_caches is not None else None + hidden_states, residual = layer(positions, + hidden_states, + residual, + kv_cache=kv_cache, + attn_metadata=attn_metadata) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + # add `CustomQwen2Model` to init self.model + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = 
class CustomQwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    """Causal-LM wrapper around ``CustomQwen2Model`` with LoRA/PP support."""

    # add `CustomQwen2Model` to init self.model
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.lora_config = lora_config

        self.quant_config = quant_config
        self.model = CustomQwen2Model(vllm_config=vllm_config,
                                      prefix=maybe_prefix(prefix, "model"))

        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                # Tied embeddings: reuse the input embedding matrix as lm_head.
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(config.vocab_size,
                                              config.hidden_size,
                                              quant_config=quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "lm_head"))
        else:
            # Only the last PP rank computes logits.
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Delegate to the backbone; returns hidden states on the last PP
        rank, otherwise IntermediateTensors for the next rank."""
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        # With tied embeddings there is no separate lm_head checkpoint entry.
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)


# Monkey-patch: make vLLM instantiate this class wherever the stock
# Qwen2ForCausalLM would be used.
vllm.model_executor.models.qwen2.Qwen2ForCausalLM = CustomQwen2ForCausalLM
/dev/null +++ b/vllm_ascend/torchair/models/qwen3_moe.py @@ -0,0 +1,537 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapted from vllm/model_executor/models/qwen3_moe.py +# This file is a part of the vllm-ascend project. 
+from typing import Any, List, Optional, Union + +import torch +import vllm.envs as envs +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, CompilationLevel, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group) +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.fused_moe.layer import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.models.interfaces import (MixtureOfExperts, + SupportsLoRA, SupportsPP) +from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention, + Qwen3MoeDecoderLayer, + Qwen3MoeForCausalLM, + Qwen3MoeMLP, Qwen3MoeModel, + Qwen3MoeSparseMoeBlock) +from vllm.model_executor.models.utils import ( + PPMissingLayer, extract_layer_index, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.ops.fused_moe import AscendFusedMoE +from vllm_ascend.ops.sequence_parallel import (MetadataForPadding, + init_metadata_for_sp) + + +class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock): + + def __init__( + self, + config: PretrainedConfig, + quant_config: 
class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
    """Qwen3-MoE sparse block backed by the Ascend fused-MoE kernel."""

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        # Call nn.Module.__init__ directly so the parent class's experts
        # (stock FusedMoE) are never constructed; AscendFusedMoE replaces them.
        nn.Module.__init__(self)
        self.tp_size = get_tensor_model_parallel_world_size()
        if self.tp_size > config.num_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.num_experts}.")

        # Router: unquantized, replicated on every rank.
        self.gate = ReplicatedLinear(
            config.hidden_size,
            config.num_experts,
            bias=False,
            quant_config=None,
            prefix=f"{prefix}.gate",
        )

        # reduce_results=False: the cross-rank reduction is handled outside.
        self.experts = AscendFusedMoE(
            num_experts=config.num_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            reduce_results=False,
            renormalize=config.norm_topk_prob,
            quant_config=quant_config,
            prefix=f"{prefix}.experts",
        )

        self.top_k = config.num_experts_per_tok

        self.dp_size = get_dp_group().world_size

        self.tp_group = get_tp_group().device_group
        self.tp_rank = get_tp_group().rank_in_group
        self.ep_group = get_ep_group()

        self.params_dtype = torch.get_default_dtype()

    def forward(
        self,
        hidden_states,
        attn_metadata=None,
        _metadata_for_padding: Optional[MetadataForPadding] = None,
    ):
        """Route tokens through the Ascend fused-MoE experts.

        ``attn_metadata`` falls back to the current forward context when not
        supplied explicitly.
        """
        if attn_metadata is None:
            attn_metadata = get_forward_context().attn_metadata
        # when profile runs, force experts to load balanced tokens
        # to avoid high memory consumption on a single rank.
        enable_force_load_balance = get_forward_context().in_profile_run
        is_prefill = get_forward_context().with_prefill

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)

        hidden_states = self.experts(
            hidden_states=hidden_states,
            router_logits=router_logits,
            is_prefill=is_prefill,
            top_k=self.top_k,
            enable_force_load_balance=enable_force_load_balance,
            shared_experts=None,
            _metadata_for_padding=_metadata_for_padding,
        )

        return hidden_states
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + @staticmethod + def normalize_qkv(qkv: torch.Tensor, q_size: int, kv_size: int, + head_dim: int, q_norm, k_norm): + q, k, v = qkv.split([q_size, kv_size, kv_size], dim=-1) + + q_by_head = q.view(*q.shape[:-1], q.shape[-1] // head_dim, head_dim) + q_by_head = q_norm(q_by_head) + q = q_by_head.view(q.shape) + + k_by_head = k.view(*k.shape[:-1], k.shape[-1] // head_dim, head_dim) + k_by_head = k_norm(k_by_head) + k = k_by_head.view(k.shape) + + return q, k, v + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: Optional[torch.Tensor] = None, + attn_metadata: Optional[AttentionMetadata] = None) -> 
    def forward(
            self,
            positions: torch.Tensor,
            hidden_states: torch.Tensor,
            kv_cache: Optional[torch.Tensor] = None,
            attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
        """QKV projection, per-head Q/K norm, RoPE, attention, output proj.

        Takes a fast path when the torchair graph is enabled and the batch is
        decode-only; otherwise falls back to the standard attention call.
        """
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = self.normalize_qkv(qkv, self.q_size, self.kv_size,
                                     self.head_dim, self.q_norm, self.k_norm)

        if (self.torchair_graph_enabled and attn_metadata is not None and
                attn_metadata.attn_state == AscendAttentionState.DecodeOnly):
            # Torchair decode-only path: call the attention impl directly
            # (trace_flag=False) instead of going through the Attention layer.
            q, k = self.rotary_emb(positions,
                                   q,
                                   k,
                                   is_prefill=False,
                                   is_qwen_torchair=True)
            forward_kwargs = {}
            if envs.VLLM_USE_V1:
                # V1 engine expects a preallocated output buffer.
                output_shape = q.shape
                output = torch.empty(output_shape,
                                     dtype=q.dtype,
                                     device=q.device)
                forward_kwargs['output'] = output

            attn_output = self.attn.impl.forward(self.attn,
                                                 q,
                                                 k,
                                                 v,
                                                 kv_cache=kv_cache,
                                                 attn_metadata=attn_metadata,
                                                 trace_flag=False,
                                                 **forward_kwargs)
            output, _ = self.o_proj(attn_output)
            return output
        else:
            q, k = self.rotary_emb(positions, q, k, is_qwen_torchair=True)
            attn_output = self.attn(q, k, v)
            output, _ = self.o_proj(attn_output)
            return output
class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
    """Qwen3-MoE decoder layer with Ascend attention/MoE and optional
    sequence parallelism (SP) padding support."""

    def __init__(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        vllm_config: Optional[VllmConfig] = None,
        prefix: str = "",
    ) -> None:

        # Skip the parent __init__: every submodule is rebuilt here with
        # Ascend-specific implementations.
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        self.self_attn = CustomQwen3MoeAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            rms_norm_eps=config.rms_norm_eps,
            qkv_bias=getattr(config, 'attention_bias', False),
            head_dim=getattr(config, 'head_dim', None),
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )

        # Dense layers can be forced via `mlp_only_layers` in the config.
        layer_idx = extract_layer_index(prefix)
        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
                           config.mlp_only_layers)
        self.use_aclgraph = (vllm_config is not None
                             and vllm_config.compilation_config.level
                             == CompilationLevel.PIECEWISE
                             and not vllm_config.model_config.enforce_eager)
        if (layer_idx not in mlp_only_layers) and (
                config.num_experts > 0 and
                (layer_idx + 1) % config.decoder_sparse_step == 0):
            if not self.use_aclgraph:
                # FIXME: custom sparse moe block doesn't work with aclgraph.
                self.mlp = CustomSparseMoeBlock(config=config,
                                                quant_config=quant_config,
                                                prefix=f"{prefix}.mlp")
            else:
                self.mlp = Qwen3MoeSparseMoeBlock(config=config,
                                                  quant_config=quant_config,
                                                  prefix=f"{prefix}.mlp")
        else:
            # Dense MLP layer (no experts).
            self.mlp = Qwen3MoeMLP(hidden_size=config.hidden_size,
                                   intermediate_size=config.intermediate_size,
                                   hidden_act=config.hidden_act,
                                   quant_config=quant_config,
                                   prefix=f"{prefix}.mlp")
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

        self.enable_sequence_parallelism = (
            vllm_config.compilation_config.pass_config.
            enable_sequence_parallelism if vllm_config is not None else False)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        kv_cache: Optional[torch.Tensor] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        _metadata_for_padding: Optional[MetadataForPadding] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:

        # To prevent precision issues during the decoder phase when only prefilling enables SP
        if not self.enable_sequence_parallelism:
            self.self_attn.o_proj.reduce_results = True
        else:
            # With SP, o_proj must not all-reduce during SP-active prefill;
            # the reduce-scatter below takes its place.
            self.self_attn.o_proj.reduce_results = not _metadata_for_padding.not_dummy_and_is_prefill if _metadata_for_padding is not None else True

        # Self Attention
        if residual is None:
            residual = hidden_states
            if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
                # SP keeps the residual in its padded, sliced layout.
                residual = _metadata_for_padding.padding_slice(residual)

            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)

        if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
            # Gather the full sequence before attention (SP shards tokens).
            hidden_states = _metadata_for_padding.allgather_unpadding_aligned(
                hidden_states)

        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )

        if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
            # Scatter back to per-rank token shards after attention.
            hidden_states = _metadata_for_padding.padding_aligned_reduce_scatter(
                hidden_states)

        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)

        if not self.use_aclgraph:
            hidden_states = self.mlp(
                hidden_states, _metadata_for_padding=_metadata_for_padding)
        else:
            # Stock sparse block does not accept the SP padding argument.
            hidden_states = self.mlp(hidden_states)

        return hidden_states, residual
@support_torch_compile
class CustomQwen3MoeModel(Qwen3MoeModel):
    """Qwen3-MoE backbone built from ``CustomQwen3MoeDecoderLayer`` with
    pipeline-parallel slicing and optional sequence-parallel padding."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # Rebuild the whole module tree; the parent __init__ is skipped.
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        parallel_config = vllm_config.parallel_config
        self.num_redundant_experts = parallel_config.num_redundant_experts
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.config = config
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
            prefix=f"{prefix}.embed_tokens")
        # make_layers handles PP slicing (start_layer/end_layer).
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: CustomQwen3MoeDecoderLayer(
                config=config,
                cache_config=cache_config,
                quant_config=quant_config,
                vllm_config=vllm_config,
                prefix=prefix),
            prefix=f"{prefix}.layers",
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        _metadata_for_padding: Optional[MetadataForPadding] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Embed (first PP rank), run this rank's layers, and return final
        hidden states on the last rank or IntermediateTensors otherwise."""
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for i in range(self.start_layer, self.end_layer):
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
                # kv_caches is indexed relative to this rank's first layer.
                kv_caches[i -
                          self.start_layer] if kv_caches is not None else None,
                attn_metadata,
                _metadata_for_padding=_metadata_for_padding)
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })

        hidden_states, _ = self.norm(hidden_states, residual)

        if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
            # SP leaves tokens sharded; gather the full sequence at the end.
            hidden_states = _metadata_for_padding.allgather_unpadding_aligned(
                hidden_states)

        return hidden_states
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Build sequence-parallel padding metadata for this step and run
        the backbone; return type follows the backbone (hidden states on the
        last PP rank, IntermediateTensors otherwise)."""
        # SP metadata is recomputed per call from the current batch shape.
        _metadata_for_padding = init_metadata_for_sp(
            input_ids, self.enable_sequence_parallelism)
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, intermediate_tensors,
                                   inputs_embeds, _metadata_for_padding)
        return hidden_states
class TorchairDeepSeekShareHead(SharedHead):
    """MTP shared head: RMSNorm followed by a vocab-parallel LM head.

    Rebuilds the submodules directly (skipping the parent __init__) so the
    LM head receives the torchair-specific prefix for weight mapping.
    """

    def __init__(self,
                 config: PretrainedConfig,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = "") -> None:
        nn.Module.__init__(self)
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.head = ParallelLMHead(config.vocab_size,
                                   config.hidden_size,
                                   quant_config=quant_config,
                                   prefix=maybe_prefix(prefix, "head"))
class TorchairDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer
                                               ):
    """One DeepSeek MTP layer: fuse token embedding with the previous step's
    hidden state, then run one torchair decoder block."""

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        model_config: ModelConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        nn.Module.__init__(self)

        # Separate norms for the embedding path (enorm) and the previous
        # hidden-state path (hnorm) before fusion.
        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Projects the concatenated [embedding; hidden] back to hidden_size.
        self.eh_proj = nn.Linear(config.hidden_size * 2,
                                 config.hidden_size,
                                 bias=False)
        self.shared_head = TorchairDeepSeekShareHead(config=config,
                                                     quant_config=quant_config,
                                                     prefix=maybe_prefix(
                                                         prefix,
                                                         "shared_head"))
        self.mtp_block = TorchairDeepseekV2DecoderLayer(
            config, prefix, model_config, cache_config, quant_config)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_index: int = 0,
    ) -> torch.Tensor:
        assert inputs_embeds is not None
        # masking inputs at position 0, as not needed by MTP
        inputs_embeds = torch.where((positions == 0).unsqueeze(-1),
                                    torch.zeros_like(inputs_embeds),
                                    inputs_embeds)
        inputs_embeds = self.enorm(inputs_embeds)
        previous_hidden_states = self.hnorm(previous_hidden_states)

        hidden_states = self.eh_proj(
            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))

        hidden_states, residual = self.mtp_block(positions=positions,
                                                 hidden_states=hidden_states,
                                                 kv_cache=kv_cache,
                                                 attn_metadata=attn_metadata,
                                                 residual=None)
        # The decoder block returns (hidden, residual); recombine here.
        hidden_states = residual + hidden_states
        return hidden_states
class TorchairDeepSeekMultiTokenPredictor(DeepSeekMultiTokenPredictor):
    """Stack of MTP layers keyed by absolute layer index (so checkpoint
    weight names map directly), with per-spec-step dispatch."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        # MTP layers are appended after the main model's hidden layers.
        self.mtp_start_layer_idx = config.num_hidden_layers
        self.num_mtp_layers = config.num_nextn_predict_layers
        # to map the exact layer index from weights
        self.layers = torch.nn.ModuleDict({
            str(idx):
            TorchairDeepSeekMultiTokenPredictorLayer(
                config,
                f"{prefix}.layers.{idx}",
                model_config=vllm_config.model_config,
                cache_config=vllm_config.cache_config,
                quant_config=vllm_config.quant_config,
            )
            for idx in range(self.mtp_start_layer_idx,
                             self.mtp_start_layer_idx + self.num_mtp_layers)
        })
        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )

        # Note: torch._dynamo.exc.Unsupported: builtin: str
        # Keep a plain list so graph capture never needs str(idx) lookups.
        self.layers_list = [
            self.layers[str(idx)]
            for idx in range(self.mtp_start_layer_idx,
                             self.mtp_start_layer_idx + self.num_mtp_layers)
        ]
        self.logits_processor = LogitsProcessor(config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: torch.Tensor,
        attn_metadata: AttentionMetadata,
        previous_hidden_states: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Run the MTP layer selected by ``spec_step_idx`` (modulo the
        number of MTP layers)."""
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
        step_kv_cache = kv_caches[
            current_step_idx] if kv_caches is not None else None
        return self.layers_list[current_step_idx](
            input_ids,
            positions,
            step_kv_cache,
            attn_metadata,
            previous_hidden_states,
            inputs_embeds,
            current_step_idx,
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Project the selected MTP layer's normed hidden states to logits."""
        current_step_idx = (spec_step_idx % self.num_mtp_layers)
        mtp_layer = self.layers_list[current_step_idx]
        # shared_head(hidden_states) applies the head's RMSNorm first.
        logits = self.logits_processor(mtp_layer.shared_head.head,
                                       mtp_layer.shared_head(hidden_states),
                                       sampling_metadata)
        return logits
class TorchairDeepSeekMTP(DeepSeekMTP):
    """Top-level DeepSeek MTP model for the torchair (NPU) backend.

    NOTE 1.The quantized MTP layer of deepseek on the NPU is not quantized;
    NOTE 2.The description file generated by the current msmodelslim tool does
    not have MTP layer info. Please manually add it and set the value to FLOAT.
    """

    packed_modules_mapping = {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts":
        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        nn.Module.__init__(self)
        self.config = vllm_config.model_config.hf_config
        self.model = TorchairDeepSeekMultiTokenPredictor(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model"))

        self.sampler = get_sampler()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        previous_hidden_states: Optional[torch.Tensor] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        spec_step_idx: int = 0,
    ) -> torch.Tensor:
        """Delegate to the multi-token predictor for the given spec step.

        ``intermediate_tensors`` is accepted for interface compatibility but
        not used by the predictor.
        """
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, previous_hidden_states,
                                   inputs_embeds, spec_step_idx)
        return hidden_states
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # Adapted from +# # vllm-project/vllm/blob/main/vllm/model_executor/models/deepseek_v2.py +# # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# # vllm-project/vllm/vllm/model_executor/models/deepseek_v2.py +# """Inference-only DeepseekV2/DeepseekV3 model.""" + +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +import torch +import torch_npu +from torch import nn +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.config import (CacheConfig, ModelConfig, VllmConfig, + get_current_vllm_config) +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, split_tensor_along_last_dim, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, + tensor_model_parallel_reduce_scatter) +from vllm.distributed.parallel_state import get_dp_group, get_ep_group +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, + UnquantizedLinearMethod) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from 
vllm.model_executor.layers.sampler import get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.deepseek_v2 import \ + DeepseekV2ForCausalLM # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import \ + yarn_get_mscale # noqa: E501 +from vllm.model_executor.models.deepseek_v2 import ( + DeepseekV2Attention, DeepseekV2DecoderLayer, DeepseekV2MLAAttention, + get_spec_layer_idx_from_weight_name) +from vllm.model_executor.models.utils import ( + PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.sequence import IntermediateTensors + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.quantization.quant_config import AscendLinearMethod +from vllm_ascend.torchair.ops.torchair_fused_moe import TorchairAscendFusedMoE +from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import \ + TorchairAscendW8A8DynamicLinearMethod +from vllm_ascend.utils import dispose_tensor, npu_prefetch + + +class TorchairDeepseekV2SiluAndMul(SiluAndMul): + + def __init__(self, + *, + weight_scale: Optional[Callable[[], torch.Tensor]] = None): + super().__init__() + self.weight_scale = weight_scale + + def forward_oot(self, x: Union[torch.Tensor, Tuple[torch.Tensor, + torch.Tensor]]): + if isinstance(x, tuple): + assert self.weight_scale is not None + # For AscendW8A8DynamicLinearMethod: + # a dynamic scale is passed along with the quantized value. 
+ quantized_x, dynamic_scale = x + return torch_npu.npu_dequant_swiglu_quant( + x=quantized_x, + weight_scale=self.weight_scale(), + activation_scale=dynamic_scale, + activate_left=True, + quant_mode=1) + else: + return super().forward_oot(x) + + +class TorchairDeepseekV2MergedReplicatedLinear(ReplicatedLinear): + + def __init__( + self, + input_size: int, + output_sizes: list[int], + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + self.output_sizes = output_sizes + super().__init__(input_size, + sum(output_sizes), + bias=bias, + quant_config=quant_config, + prefix=prefix) + + def weight_loader(self, param: torch.nn.Parameter, + loaded_weight: torch.Tensor, loaded_shard_id: int): + # With no support for GGUF format yet. + assert not getattr(param, "is_gguf_weight", False) + assert not getattr(param, "is_gguf_weight_type", False) + + assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + shard = param.data.narrow(param.output_dim, shard_offset, shard_size) + + assert shard.size() == loaded_weight.size(), ( + f"Tried to load weights of size {loaded_weight.size()}" + f"to a parameter shard of id {loaded_shard_id} size {shard.size()}" + ) + shard.copy_(loaded_weight) + + +class TorchairDeepseekV2RowParallelLinearReplaceAllreduce(RowParallelLinear): + + def forward( + self, + input_, + is_prefill=True, + is_force_scatter=False + ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]: + if self.input_is_parallel: + input_parallel = input_ + else: + tp_rank = get_tensor_model_parallel_rank() + splitted_input = split_tensor_along_last_dim( + input_, num_partitions=self.tp_size) + input_parallel = splitted_input[tp_rank].contiguous() + + # Matrix multiply. 
        assert self.quant_method is not None
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
        output_parallel = self.quant_method.apply(self,
                                                  input_parallel,
                                                  bias=bias_)
        if self.reduce_results and self.tp_size > 1:
            num_tokens = output_parallel.shape[0]
            if is_force_scatter and num_tokens % self.tp_size:
                # Pad the token dim up to a multiple of tp_size so the
                # reduce-scatter can split it evenly; `-n % tp` is the
                # non-negative pad amount in Python.
                output_parallel = nn.functional.pad(
                    output_parallel, (0, 0, 0, -num_tokens % self.tp_size))
            if is_force_scatter or (not is_prefill
                                    and output_parallel.shape[0] % self.tp_size
                                    == 0):
                # Each rank keeps only its 1/tp_size slice of the tokens.
                output = tensor_model_parallel_reduce_scatter(output_parallel,
                                                              dim=0)
            else:
                # Fallback: classic all-reduce (every rank gets full output).
                output = tensor_model_parallel_all_reduce(output_parallel)
        else:
            output = output_parallel

        output_bias = self.bias if self.skip_bias_add else None

        if not self.return_bias:
            return output
        return output, output_bias


class TorchairDeepseekV2RowParallelLinear(RowParallelLinear):
    # Identical math to the stock RowParallelLinear.forward, but accepts (and
    # ignores) `is_prefill`/`is_force_scatter` so call sites can invoke both
    # row-parallel variants with a uniform signature.

    def forward(
        self,
        input_,
        is_prefill=True,
        is_force_scatter=False
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[nn.Parameter]]]:
        if self.input_is_parallel:
            input_parallel = input_
        else:
            # Split the last dim across TP ranks and keep this rank's slice.
            tp_rank = get_tensor_model_parallel_rank()
            splitted_input = split_tensor_along_last_dim(
                input_, num_partitions=self.tp_size)
            input_parallel = splitted_input[tp_rank].contiguous()

        # Matrix multiply.
        assert self.quant_method is not None
        # Only fuse bias add into GEMM for rank 0 (this ensures that
        # bias will not get added more than once in TP>1 case)
        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
        output_parallel = self.quant_method.apply(self,
                                                  input_parallel,
                                                  bias=bias_)
        if self.reduce_results and self.tp_size > 1:
            output = tensor_model_parallel_all_reduce(output_parallel)
        else:
            output = output_parallel

        output_bias = self.bias if self.skip_bias_add else None

        if not self.return_bias:
            return output
        return output, output_bias


class TorchairDeepseekV2MLP(nn.Module):
    # DeepseekV2 MLP: gate_up projection -> silu&mul -> down projection.
    # With force_replicate=True the projections are replicated instead of
    # tensor-parallel, which is needed when multistream MoE / shared-expert DP
    # keeps the shared experts local to each rank.

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        reduce_results: bool = True,
        force_replicate: bool = False,
        prefix: str = "",
    ) -> None:
        super().__init__()
        if not force_replicate:
            # Standard TP sharding: column-parallel gate_up, row-parallel down.
            self.gate_up_proj = MergedColumnParallelLinear(
                hidden_size, [intermediate_size] * 2,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.gate_up_proj")
            self.down_proj = RowParallelLinear(intermediate_size,
                                               hidden_size,
                                               bias=False,
                                               quant_config=quant_config,
                                               reduce_results=reduce_results,
                                               prefix=f"{prefix}.down_proj")
        else:
            # Fully replicated weights: no TP communication inside the MLP.
            self.gate_up_proj = TorchairDeepseekV2MergedReplicatedLinear(
                hidden_size, [intermediate_size] * 2,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.gate_up_proj")
            self.down_proj = ReplicatedLinear(intermediate_size,
                                              hidden_size,
                                              bias=False,
                                              quant_config=quant_config,
                                              prefix=f"{prefix}.down_proj")
        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + + quant_method = self.gate_up_proj.quant_method + if isinstance(quant_method, UnquantizedLinearMethod): + self.act_fn = TorchairDeepseekV2SiluAndMul() + elif (isinstance(quant_method, AscendLinearMethod) + and isinstance(quant_method.quant_method, + TorchairAscendW8A8DynamicLinearMethod)): + # TODO(sdmyzlp): Currently preserved as before: + # 1. The only quantization supported for silu is W8A8Dynamic + # 2. Output dtype of gate_up/down is fixed to be int32/bfloat16 + # + # Maybe one can implement a better and more general configuration + # scheme, e.g. by somehow passing around the tweaked `quant_config` + self.act_fn = TorchairDeepseekV2SiluAndMul( + # Use lazy binding, for `weight_scale_fp32` is accessible + # only after `process_weights_after_loading`. + weight_scale=lambda: self.gate_up_proj.weight_scale_fp32) + # To be consumed by AscendW8A8DynamicLinearMethod.apply() + self.gate_up_proj._ascend_quant_config = { + "output_dtype": torch.int32, + "pertoken_scale": False, + "return_scale": True, + } + self.down_proj._ascend_quant_config = { + "output_dtype": torch.bfloat16, + "pertoken_scale": True, + "return_scale": False, + } + else: + raise NotImplementedError( + f"Quantization with [{type(quant_method)}] is NOT supported") + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class TorchairDeepseekV2MoE(nn.Module): + + top_k: int + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + self.n_shared_experts = config.n_shared_experts + if self.tp_size > config.n_routed_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {config.n_routed_experts}.") + + if config.hidden_act 
!= "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. " + "Only silu is supported for now.") + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_moe = \ + ascend_config.torchair_graph_config.enable_multistream_moe and \ + self.torchair_graph_enabled + + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = TorchairAscendFusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + + if config.n_shared_experts is not None: + self.all_reduce_merge = self.experts.all_reduce_merge + reduce_results = not self.all_reduce_merge + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + self.shared_experts = TorchairDeepseekV2MLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=reduce_results, + force_replicate=self.enable_multistream_moe + or enable_shared_expert_dp, + prefix=f"{prefix}.shared_experts", + ) + else: + self.shared_experts = None # type: ignore + TorchairDeepseekV2MoE.top_k = config.num_experts_per_tok + + self.dp_size = get_dp_group().world_size + + self.tp_group = 
get_tp_group().device_group + self.tp_rank = get_tp_group().rank_in_group + self.ep_group = get_ep_group() + self.kv_consumer = None + transfer_config = get_current_vllm_config().kv_transfer_config + if transfer_config is not None: + self.kv_consumer = transfer_config.kv_role == "kv_consumer" + + self.params_dtype = torch.get_default_dtype() + self.rm_router_logits = self.experts.rm_router_logits + + def forward(self, + hidden_states: torch.Tensor, + attn_metadata: Optional[AttentionMetadata] = None, + replace_allreduce: bool = False) -> torch.Tensor: + + forward_context = get_forward_context() + # when profile runs, force experts to load balanced tokens + # to avoid high memory consumption on a single rank. + + enable_force_load_balance = forward_context.in_profile_run + + is_prefill = forward_context.with_prefill + + # If this node is kv_consumer, we force the moe always runs in decode path to make sure + # the behaviour aligned between dummy_run and normal model_execute. + if self.kv_consumer: + is_prefill = False + enable_force_load_balance = False + + # router_logits: (num_tokens, n_experts) + router_logits = None + if not self.rm_router_logits and not self.enable_multistream_moe: + router_logits, _ = self.gate(hidden_states) + + experts_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + is_prefill=is_prefill, + top_k=TorchairDeepseekV2MoE.top_k, + enable_force_load_balance=enable_force_load_balance, + shared_experts=self.shared_experts, + gate=self.gate, + replace_allreduce=replace_allreduce) + + hidden_states = ( + experts_hidden_states[0] * self.routed_scaling_factor + + experts_hidden_states[1]) + if self.all_reduce_merge: + # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce + hidden_states = tensor_model_parallel_all_reduce(hidden_states) + + return hidden_states + + +class 
TorchairDeepseekV2MLAAttention(DeepseekV2MLAAttention):
    # Multi-head Latent Attention with torchair-specific projections and
    # communication (replace-allreduce o_proj on MoE layers, multistream MLA).

    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        q_lora_rank: Optional[int],
        kv_lora_rank: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        # Deliberately bypass DeepseekV2MLAAttention.__init__ and rebuild the
        # submodules here with the Ascend-specific linear classes.
        nn.Module.__init__(self)
        self.hidden_size = hidden_size
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim

        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank

        self.num_heads = num_heads
        self.tp_size = get_tensor_model_parallel_world_size()
        assert num_heads % self.tp_size == 0
        self.num_local_heads = num_heads // self.tp_size
        self.layers = config.num_hidden_layers
        self.first_k_dense_replace = config.first_k_dense_replace

        self.scaling = self.qk_head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        self.prefix = prefix
        # prefix looks like "...layers.<idx>.self_attn"; [-2] is the layer idx.
        self.debug_layer_idx = int(self.prefix.split(".")[-2])

        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
        self.enable_multistream_mla = \
            ascend_config.torchair_graph_config.enable_multistream_mla
        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.q_lora_rank is not None:
            # Low-rank query path: q_a (down) -> layernorm -> q_b (up).
            self.q_a_proj = ReplicatedLinear(self.hidden_size,
                                             self.q_lora_rank,
                                             bias=False,
                                             quant_config=quant_config,
                                             prefix=f"{prefix}.q_a_proj")
            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
                                         eps=config.rms_norm_eps)
            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
                                                 self.num_heads *
                                                 self.qk_head_dim,
                                                 bias=False,
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.q_b_proj")
        else:
            self.q_proj = ColumnParallelLinear(self.hidden_size,
                                               self.num_heads *
                                               self.qk_head_dim,
                                               bias=False,
                                               quant_config=quant_config,
                                               prefix=f"{prefix}.q_proj")

        self.kv_a_proj_with_mqa = ReplicatedLinear(
            self.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_a_proj_with_mqa")
        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
                                      eps=config.rms_norm_eps)
        self.kv_b_proj = ColumnParallelLinear(
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_b_proj")
        # On MoE layers with multistream-moe / shared-expert DP, use the
        # o_proj variant whose all-reduce can be replaced by reduce-scatter.
        if (config.n_routed_experts is not None
                and self.debug_layer_idx >= config.first_k_dense_replace
                and self.debug_layer_idx % config.moe_layer_freq == 0
                and (ascend_config.torchair_graph_config.enable_multistream_moe
                     or self.enable_shared_expert_dp)):
            self.o_proj = TorchairDeepseekV2RowParallelLinearReplaceAllreduce(
                self.num_heads * self.v_head_dim,
                self.hidden_size,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.o_proj")
        else:
            self.o_proj = TorchairDeepseekV2RowParallelLinear(
                self.num_heads * self.v_head_dim,
                self.hidden_size,
                bias=False,
                quant_config=quant_config,
                prefix=f"{prefix}.o_proj")

        if rope_scaling:
            rope_scaling["rope_type"] = 'deepseek_yarn'
        self.rotary_emb = get_rope(qk_rope_head_dim,
                                   rotary_dim=qk_rope_head_dim,
                                   max_position=max_position_embeddings,
                                   base=rope_theta,
                                   rope_scaling=rope_scaling,
                                   is_neox_style=False)
        if rope_scaling:
            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
            scaling_factor = rope_scaling["factor"]
            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
            self.scaling = self.scaling * mscale * mscale

        # In the MLA backend, kv_cache includes both k_c and
        # pe (i.e. decoupled position embeddings). In particular,
        # the concat_and_cache_mla op requires
        # k_c.size(1) + k_pe.size(1) == kv_cache.size(2)
        # i.e.
        # kv_lora_rank + qk_rope_head_dim == head_size
        self.mla_attn = Attention(
            num_heads=self.num_local_heads,
            head_size=self.kv_lora_rank + self.qk_rope_head_dim,
            scale=self.scaling,
            num_kv_heads=1,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.attn",
            use_mla=True,
            # MLA Args
            q_lora_rank=self.q_lora_rank,
            kv_lora_rank=self.kv_lora_rank,
            qk_nope_head_dim=self.qk_nope_head_dim,
            qk_rope_head_dim=self.qk_rope_head_dim,
            qk_head_dim=self.qk_head_dim,
            v_head_dim=self.v_head_dim,
            rotary_emb=self.rotary_emb,
            q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj,
            kv_a_proj_with_mqa=self.kv_a_proj_with_mqa,
            kv_a_layernorm=self.kv_a_layernorm,
            kv_b_proj=self.kv_b_proj,
            o_proj=self.o_proj,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: Optional[torch.Tensor] = None,
        attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
        forward_context = get_forward_context()
        # Multistream MLA only applies to pure-decode batches.
        enable_multistream_mla = (self.enable_multistream_mla
                                  and attn_metadata is not None
                                  and not forward_context.with_prefill
                                  and attn_metadata.num_decodes > 0)
        forward_kwargs = {"enable_multistream_mla": enable_multistream_mla}
        if self.q_lora_rank is not None:
            # Prefetch the q_a weight onto the NPU while other work overlaps.
            npu_prefetch(self.q_a_proj.weight,
                         hidden_states,
                         enabled=enable_multistream_mla)
            ckq = self.q_a_proj(hidden_states)[0]
            hidden_states_or_q_c = self.q_a_layernorm(ckq)
            forward_kwargs['ckq'] = ckq
        else:
            hidden_states_or_q_c = hidden_states
        if self.torchair_graph_enabled:
            # Graph mode: pre-allocate the output and call the impl directly.
            output_shape = hidden_states.shape
            output = torch.empty(output_shape,
                                 dtype=hidden_states_or_q_c.dtype,
                                 device=hidden_states_or_q_c.device)
            forward_kwargs['output'] = output
            output = self.mla_attn.impl.forward(self.mla_attn,
                                                hidden_states_or_q_c,
                                                hidden_states, None, kv_cache,
                                                attn_metadata,
                                                **forward_kwargs)
            output = output.view(-1, output_shape[-1])
            return output
        else:
            # Eager mode: split kv_a output into latent kv and rope parts.
            kv_no_split = self.kv_a_proj_with_mqa(hidden_states)[0]
            if self.enable_shared_expert_dp and self.debug_layer_idx > self.first_k_dense_replace and self.debug_layer_idx < self.layers:
                # Shared-expert DP keeps tokens sharded; gather them back
                # before attention on the inner MoE layers.
                hidden_states_or_q_c = get_tp_group().all_gather(
                    hidden_states_or_q_c, 0)
                kv_no_split = get_tp_group().all_gather(kv_no_split, 0)

            kv_c, k_pe = kv_no_split.split(
                [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
            kv_c_normed = self.kv_a_layernorm(kv_c.contiguous())
            if not self.enable_shared_expert_dp or self.debug_layer_idx < self.first_k_dense_replace:
                output_shape = hidden_states.shape
            else:
                # ceil(num_tokens / tp_size) rows per rank after scatter.
                num_tokens = hidden_states_or_q_c.shape[0]
                rows = num_tokens // self.tp_size
                if num_tokens % self.tp_size:
                    rows += 1
                output_shape = (rows, hidden_states.shape[1])
            return self.mla_attn(hidden_states_or_q_c,
                                 kv_c_normed,
                                 k_pe,
                                 output_shape=output_shape)


class TorchairDeepseekV2DecoderLayer(DeepseekV2DecoderLayer):

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        model_config: ModelConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        # Bypass DeepseekV2DecoderLayer.__init__; rebuild submodules with the
        # torchair-specific attention/MLP/MoE classes.
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size
        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
        # DecoderLayers are created with `make_layers` which passes the prefix
        # with the layer's index.
        layer_idx = int(prefix.split(sep='.')[-1])
        self.layer_idx = layer_idx
        self.layers = config.num_hidden_layers
        self.tp_size = get_tensor_model_parallel_world_size()
        self.tp_rank = get_tp_group().rank_in_group
        ascend_config = get_ascend_config()
        # TODO: enable mla in vllm-ascend
        if model_config.use_mla:
            attn_cls = TorchairDeepseekV2MLAAttention
        else:
            attn_cls = DeepseekV2Attention
        self.self_attn = attn_cls(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            qk_nope_head_dim=config.qk_nope_head_dim,
            qk_rope_head_dim=config.qk_rope_head_dim,
            v_head_dim=config.v_head_dim,
            q_lora_rank=config.q_lora_rank
            if hasattr(config, "q_lora_rank") else None,
            kv_lora_rank=config.kv_lora_rank,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )

        # MoE on layers >= first_k_dense_replace (subject to moe_layer_freq);
        # dense MLP otherwise.
        if (config.n_routed_experts is not None
                and layer_idx >= config.first_k_dense_replace
                and layer_idx % config.moe_layer_freq == 0):
            self.mlp = TorchairDeepseekV2MoE(
                config=config,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
            self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
                and model_config.use_mla and self.tp_size > 1
        else:
            self.mlp = TorchairDeepseekV2MLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                prefix=f"{prefix}.mlp",
            )
            self.mla_moe_communication = False
        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)
        self.routed_scaling_factor = config.routed_scaling_factor
        self.first_k_dense_replace = config.first_k_dense_replace
        self.tp_group = get_tp_group().device_group
        self.enable_shared_expert_dp = 
ascend_config.enable_shared_expert_dp

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        kv_cache: Optional[torch.Tensor] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        replace_allreduce: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Returns (hidden_states, residual) for the next layer.
        # Self Attention
        if attn_metadata is not None and attn_metadata.num_decodes > 0:
            mla_moe_communication = self.mla_moe_communication and replace_allreduce
        else:
            mla_moe_communication = False
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            previous_hidden_states, previous_residual = hidden_states, residual
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
            # Dispose hidden_states and residual from the previous layer
            # to save npu memory because they're no longer used.
            dispose_tensor(previous_hidden_states)
            dispose_tensor(previous_residual)
        if mla_moe_communication and self.layer_idx > self.first_k_dense_replace:
            # Tokens were reduce-scattered by the previous MoE layer;
            # gather them back before attention.
            hidden_states = tensor_model_parallel_all_gather(hidden_states,
                                                             dim=0)

        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )

        if mla_moe_communication and residual.shape[0] != hidden_states.shape[
                0]:
            # Attention output is token-sharded; keep matching slice of
            # the residual.
            chunk_hidden_states = torch.tensor_split(residual,
                                                     self.tp_size,
                                                     dim=0)
            residual = chunk_hidden_states[self.tp_rank]

        if hidden_states.dtype == torch.float16:
            # Fix FP16 overflow
            # We scale both hidden_states and residual before
            # rmsnorm, and rmsnorm result would not affect by scale.
            hidden_states *= 1. / self.routed_scaling_factor
            if self.layer_idx == 0:
                # The residual is shared by all layers, we only scale it on
                # first layer.
                residual *= 1. / self.routed_scaling_factor

        tp_size = get_tensor_model_parallel_world_size()
        if self.enable_shared_expert_dp and (
                self.layer_idx == self.first_k_dense_replace
                or self.layer_idx == self.layers) and tp_size > 1:
            # Shard the residual to match the token-sharded hidden states;
            # pad first so tensor_split yields equal chunks.
            num_tokens, _ = residual.shape
            if num_tokens % tp_size:
                residual = nn.functional.pad(residual,
                                             (0, 0, 0, -num_tokens % tp_size))
            chunk_residual = torch.tensor_split(residual, tp_size, dim=0)
            tp_rank = get_tensor_model_parallel_rank()
            residual = chunk_residual[tp_rank]

        # Fully Connected
        hidden_states, residual = self.post_attention_layernorm(
            hidden_states, residual)

        if isinstance(self.mlp, TorchairDeepseekV2MoE):
            hidden_states = self.mlp(hidden_states,
                                     attn_metadata,
                                     replace_allreduce=mla_moe_communication)
        else:
            hidden_states = self.mlp(hidden_states)

        if isinstance(self.mlp, TorchairDeepseekV2MLP
                      ) and hidden_states.dtype == torch.float16:
            # Fix FP16 overflow
            # Scaling the DeepseekV2MLP output, it is the input of
            # input_layernorm of next decoder layer.
            # The scaling of DeepseekV2MOE output would be done in the forward
            # of DeepseekV2MOE
            hidden_states *= 1. / self.routed_scaling_factor
        if mla_moe_communication and self.layer_idx == self.layers - 1:
            # Last layer: undo the token sharding before leaving the stack.
            hidden_states = tensor_model_parallel_all_gather(hidden_states,
                                                             dim=0)
            residual = tensor_model_parallel_all_gather(residual, dim=0)

        # for last layer of main model and mtp layer.
        if self.enable_shared_expert_dp and self.layer_idx >= (
                self.layers - 1) and tp_size > 1:
            hidden_states = get_tp_group().all_gather(hidden_states, 0)
            residual = get_tp_group().all_gather(residual, 0)

            # The gather may have re-introduced padding rows; trim back to
            # the actual token count.
            attn_metadata = get_forward_context().attn_metadata
            if attn_metadata is not None:
                num_tokens = attn_metadata.num_actual_tokens
            else:
                num_tokens = hidden_states.shape[0]

            if num_tokens < hidden_states.shape[0]:
                hidden_states = hidden_states[:num_tokens]
                residual = residual[:num_tokens]

        return hidden_states, residual


class TorchairDeepseekV2Model(nn.Module):
    # Stack of TorchairDeepseekV2DecoderLayer with embeddings and final norm,
    # pipeline-parallel aware (PPMissingLayer stands in for off-rank parts).

    fall_back_to_pt_during_load = False

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        config = vllm_config.model_config.hf_config
        model_config = vllm_config.model_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.tp_size = get_tensor_model_parallel_world_size()

        if get_pp_group().is_first_rank:
            self.embed_tokens = VocabParallelEmbedding(
                config.vocab_size,
                config.hidden_size,
                quant_config=quant_config,
                prefix=f"{prefix}.embed_tokens")
        else:
            self.embed_tokens = PPMissingLayer()

        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: TorchairDeepseekV2DecoderLayer(
                config,
                prefix,
                model_config=model_config,
                cache_config=cache_config,
                quant_config=quant_config,
            ),
            prefix=f"{prefix}.layers")

        if get_pp_group().is_last_rank:
            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        else:
            self.norm = PPMissingLayer()
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            # Non-first PP ranks receive activations from the previous stage.
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        # Allreduce replacement is only valid when tokens divide evenly
        # across TP ranks.
        replace_allreduce = hidden_states.shape[0] % self.tp_size == 0

        for i in range(self.start_layer, self.end_layer):
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
                kv_caches[i -
                          self.start_layer] if kv_caches is not None else None,
                attn_metadata,
                replace_allreduce=replace_allreduce)

        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states


class TorchairDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
    # add `packed_modules_mapping` in `DeepseekV2ForCausalLM` to support weight merging
    packed_modules_mapping = {
        "gate_up_proj": ["gate_proj", "up_proj"],
        "experts":
        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # Bypass DeepseekV2ForCausalLM.__init__; build the torchair model.
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = TorchairDeepseekV2Model(vllm_config=vllm_config,
                                             prefix=maybe_prefix(
                                                 prefix, "model"))
        if get_pp_group().is_last_rank:
            self.lm_head = ParallelLMHead(config.vocab_size,
                                          config.hidden_size,
                                          quant_config=quant_config,
                                          prefix=maybe_prefix(
                                              prefix, "lm_head"))
        else:
            self.lm_head = PPMissingLayer()
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    # NOTE: This `load_weights` is mainly copied from
    # https://github.com/vllm-project/vllm/commit/07b8fae219b1fff51ef115c38c44b51395be5bb5
    # to fix CI, and it is different from the implementation in main
    # TODO: support eplb style load_weights
    def load_weights(self, weights: Iterable[tuple[str,
                                                   torch.Tensor]]) -> set[str]:
        """Load checkpoint weights, remapping stacked (gate_up) and
        per-expert parameters; returns the set of loaded parameter names."""
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        expert_params_mapping = TorchairAscendFusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=self.config.n_routed_experts)

        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if "module" in name:
                continue

            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
            if spec_layer is not None:
                continue  # skip spec decode layers for main model

            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if (("mlp.experts." in name) and name not in params_dict):
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Not a stacked param: try the per-expert mapping next.
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)

                    if is_pp_missing_parameter(name, self):
                        continue

                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param,
                                  loaded_weight,
                                  name,
                                  shard_id=shard_id,
                                  expert_id=expert_id,
                                  return_success=False)
                    break
                else:
                    # Plain (non-stacked, non-expert) parameter.
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue

                    # Remapping the name of FP8 kv-scale.
                    name = maybe_remap_kv_scale_name(name, params_dict)
                    if name is None:
                        continue

                    if is_pp_missing_parameter(name, self):
                        continue

                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: Optional[List[torch.Tensor]] = None,
        attn_metadata: Optional[AttentionMetadata] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        # Thin delegation to the underlying TorchairDeepseekV2Model.
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states
diff --git a/vllm_ascend/torchair/models/torchair_deepseek_v3.py b/vllm_ascend/torchair/models/torchair_deepseek_v3.py
new file mode 100644
index 0000000..aef8ae0
--- /dev/null
+++ b/vllm_ascend/torchair/models/torchair_deepseek_v3.py
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from vllm_ascend.torchair.models.torchair_deepseek_v2 import \
+    TorchairDeepseekV2ForCausalLM
+
+
+class TorchairDeepseekV3ForCausalLM(TorchairDeepseekV2ForCausalLM):
+    """DeepSeek-V3 for torchair graph mode.
+
+    Inherits the TorchairDeepseekV2ForCausalLM implementation unchanged;
+    this subclass only provides the V3 class name.
+    """
+    pass
diff --git a/vllm_ascend/torchair/models/torchair_pangu_moe.py b/vllm_ascend/torchair/models/torchair_pangu_moe.py
new file mode 100644
index 0000000..eb05760
--- /dev/null
+++ b/vllm_ascend/torchair/models/torchair_pangu_moe.py
@@ -0,0 +1,1119 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union + +import torch +import torch.distributed as dist +import torch.nn.functional as F +import torch_npu +from torch import nn +from torch.nn import Parameter +from transformers import PretrainedConfig +from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import (divide, get_pp_group, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group, get_world_group) +from vllm.forward_context import get_forward_context +from vllm.logger import logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearBase, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + extract_layer_index, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_weight_attrs +from vllm.sequence import IntermediateTensors + +from 
vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_310p
+
+# Module-level handoff slot: PanguProMoESparseMoeBlock.forward stores its
+# router_scale parameter here so that the custom routing function created by
+# topk_wrapper (which vLLM's FusedMoE calls without extra context) can read it.
+_ROUTER_SCALE = None
+
+
+def use_h2p():
+    """Return True when the H2P communication schema should be used
+    (enabled only when the data-parallel world size is > 1)."""
+    # only use H2P when dp_size > 1.
+    if get_dp_group().world_size > 1:
+        return True
+    return False
+
+
+# This class is adapted from vllm.model_executor.layers.linear.MergedColumnParallelLinear.
+# It is used to customize parallelism of certain linear(e.g., shared experts with all-rank tp).
+class CustomMergedColumnParallelLinear(LinearBase):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: list[int],
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        *,
+        return_bias: bool = True,
+    ):
+        # Divide the weight matrix along the last dimension.
+        output_size = sum(output_sizes)
+        self.output_sizes = output_sizes
+        self.tp_size = get_tp_group().world_size
+        # Column-parallel: the input stays whole; only the output dim is split
+        # across TP ranks (see divide() below).
+        self.input_size_per_partition = input_size
+        self.output_size_per_partition = divide(output_size, self.tp_size)
+        self.output_partition_sizes = [self.output_size_per_partition]
+        # If QKV or MergedColumn, use output size of each partition.
+        # self.output_sizes was assigned just above, so this branch always
+        # runs and recomputes the per-shard partition sizes for the merged
+        # (e.g. gate/up) projections.
+        if hasattr(self, "output_sizes"):
+            self.output_partition_sizes = [
+                divide(output_size, self.tp_size)
+                for output_size in self.output_sizes
+            ]
+
+        super().__init__(input_size,
+                         output_size,
+                         skip_bias_add,
+                         params_dtype,
+                         quant_config,
+                         prefix,
+                         return_bias=return_bias)
+
+        self.gather_output = gather_output
+
+        if output_sizes is None:
+            output_sizes = [output_size]
+
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size_per_partition,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=self.weight_loader)
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size_per_partition,
+                            dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor,
+                      loaded_shard_id: int):
+        """Copy one merged shard (indexed by loaded_shard_id, e.g. gate or up)
+        of the checkpoint weight into this rank's partition of the merged
+        parameter."""
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+
+        assert loaded_shard_id < len(self.output_sizes)
+
+        tp_rank = get_tp_group().rank_in_group
+        tp_size = get_tp_group().world_size
+        if output_dim is not None:
+            # Offset of this shard inside the merged parameter, and its size,
+            # both already scaled down to the per-rank partition.
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // tp_size
+
+            is_sharded_weight = getattr(param, "is_sharded_weight", False)
+            param_data = param_data.narrow(output_dim, shard_offset,
+                                           shard_size)
+            start_idx = tp_rank * shard_size
+            if not is_sharded_weight:
+                loaded_weight = loaded_weight.narrow(output_dim, start_idx,
+                                                     shard_size)
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "MergedColumnParallelLinear, assume the weight is "
+                    "the same for all partitions.")
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def forward(
+        self, input_
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Apply the (possibly quantized) matmul.
+
+        Note: self.gather_output is stored in __init__ but no all-gather is
+        performed here — communication is left to the caller.
+        """
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        output_parallel = self.quant_method.apply(self, input_, bias)
+        output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        if not self.return_bias:
+            return output
+        return output, output_bias
+
+
+# This class is adapted from vllm.model_executor.layers.linear.RowParallelLinear.
+# It is used to customize parallelism of certain linear(e.g., shared experts with all-rank tp)
+# and detach communication to enable customized communication algorithms(e.g., H2P).
+class CustomRowParallelLinear(LinearBase):
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        input_is_parallel: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        *,
+        return_bias: bool = True,
+        group=None,
+    ):
+        # Divide the weight matrix along the first dimension.
+        # Shard over the supplied process group; defaults to the TP group.
+        self.group = group if group is not None else get_tp_group()
+        self.tp_rank = self.group.rank_in_group
+        self.tp_size = self.group.world_size
+        # Row-parallel: the input dim is partitioned; the output stays whole.
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        self.output_size_per_partition = output_size
+        self.output_partition_sizes = [output_size]
+
+        super().__init__(input_size,
+                         output_size,
+                         skip_bias_add,
+                         params_dtype,
+                         quant_config,
+                         prefix,
+                         return_bias=return_bias)
+
+        self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
+
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size_per_partition,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=self.weight_loader)
+        if not reduce_results and (bias and not skip_bias_add):
+            raise ValueError("When not reduce the results, adding bias to the "
+                             "results can lead to incorrect results")
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size, dtype=params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader,
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        """Copy this rank's slice (along input_dim) of a row-parallel weight
+        into the local parameter; pre-sharded weights are copied as-is."""
+        tp_rank = self.group.rank_in_group
+        input_dim = getattr(param, "input_dim", None)
+        is_sharded_weight = getattr(param, "is_sharded_weight", False)
+
+        param_data = param.data
+        if input_dim is not None and not is_sharded_weight:
+            shard_size = param_data.shape[input_dim]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(input_dim, start_idx,
+                                                 shard_size)
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def forward(
+        self, input_
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
+        """Matmul with the bias fused only on rank 0.
+
+        Note: no all-reduce is performed here even when reduce_results=True —
+        the reduction/communication is handled by the caller (H2P schema).
+        """
+        input_parallel = input_
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in TP>1 case)
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        output = self.quant_method.apply(self, input_parallel, bias=bias_)
+
+        output_bias = self.bias if self.skip_bias_add else None
+
+        if not self.return_bias:
+            return output
+        return output, output_bias
+
+
+class PanguProMoEMLP(nn.Module):
+    """Gate/up projection + SiLU-and-mul + down projection.
+
+    Uses the Custom* linears (which perform no collective communication
+    themselves) when the H2P schema is enabled, and vLLM's stock parallel
+    linears otherwise.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if not use_h2p():
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size,
+                [intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj",
+            )
+            self.down_proj = RowParallelLinear(
+                intermediate_size,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                reduce_results=reduce_results,
+                prefix=f"{prefix}.down_proj",
+            )
+        else:
+            self.gate_up_proj = CustomMergedColumnParallelLinear(
+                hidden_size,
+                [intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj",
+            )
+            self.down_proj = CustomRowParallelLinear(
+                intermediate_size,
+                hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                reduce_results=reduce_results,
+                prefix=f"{prefix}.down_proj",
+            )
+
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +def topk_wrapper(num_voted_experts): + + def pangu_group8_topk( + hidden_states: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool = False, + num_expert_group: int = 0, + topk_group: int = 0, + global_num_experts: int = 0, + ): + scores = F.softmax(gating_output, dim=1) + num_tokens = scores.shape[0] + router_scale = _ROUTER_SCALE.squeeze( # type: ignore + ) + # TODO: support disable expert parallel + ep_size = get_ep_group().world_size + local_num_experts = global_num_experts // ep_size + local_num_group = topk // ep_size + experts_per_group = global_num_experts // topk + local_group_start = get_ep_group().rank_in_group * local_num_experts + local_group_end = (get_ep_group().rank_in_group + + 1) * local_num_experts + scores = F.softmax(gating_output, dim=1) + scores = scores[..., local_group_start:local_group_end] + + router_weights = router_scale[local_group_start:local_group_end] + + if num_voted_experts == 8: + # use original topk + topk_weights, topk_ids = torch.max(scores.view( + scores.shape[0], local_num_group, -1), + dim=-1) + bias = torch.arange(0, + local_num_experts, + experts_per_group, + device=scores.device, + dtype=torch.int32).unsqueeze(0) + topk_ids = topk_ids.to(torch.int32) + bias + + else: + group_expert_indices = torch.arange(experts_per_group, + dtype=torch.int32, + device=scores.device).view( + 1, 1, -1) + group_expert_offset = (torch.arange( + local_num_group, dtype=torch.int32, device=scores.device) * + experts_per_group).unsqueeze(0) + expert_index_range = torch.arange(experts_per_group, + dtype=torch.int32, + device=scores.device) + + scores_grouped = scores.view(num_tokens, local_num_group, + experts_per_group) + best_expert_idx = torch.argmax(scores_grouped, + dim=2) # (num_tokens, num_groups) + vote_mask = 
(best_expert_idx.unsqueeze(-1).to(
+                torch.int32) == group_expert_indices)
+
+            # Per-group vote frequency of each expert position across tokens.
+            expert_vote_freq = vote_mask.sum(dim=0)
+
+            sorted_indices = torch.argsort(expert_vote_freq,
+                                           dim=1,
+                                           descending=True).to(torch.int32)
+            topk_experts = sorted_indices[:, :num_voted_experts]
+            keep_mask = ((
+                topk_experts.unsqueeze(-1) == expert_index_range).any(
+                    dim=1)).unsqueeze(0)
+
+            # Zero out scores of experts outside the num_voted_experts
+            # most-voted positions in each group, then take the per-group max.
+            masked_scores = torch.where(keep_mask, scores_grouped, 0)
+
+            topk_weights, best_pos_in_group = masked_scores.max(dim=2)
+            best_pos_in_group = best_pos_in_group.to(torch.int32)
+            topk_ids = (best_pos_in_group + group_expert_offset).to(
+                torch.int32)
+
+        # Scale each token's routing weight by the learned per-expert scale.
+        flatten_topk_ids = topk_ids.view(-1)
+        router_weights = router_weights.index_select(0, flatten_topk_ids).view(
+            topk_ids.shape)
+        topk_weights *= router_weights
+
+        return topk_weights, topk_ids
+
+    return pangu_group8_topk
+
+
+class PanguProMoESparseMoeBlock(nn.Module):
+    """Pangu Pro MoE layer: routed experts (FusedMoE with the custom grouped
+    topk from topk_wrapper) plus an optional always-active shared expert."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_experts = config.num_experts
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}.")
+
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Learned per-expert routing weight scale, shape (1, num_experts).
+        self.router_scale = torch.nn.Parameter(
+            torch.ones((1, self.num_experts)))
+
+        # on 300I Duo platform, we find that num_voted_experts set to 5 achieves
+        # good performance without sacrifice too much accuracy. for other platform,
+        # this is set to 8 to use original pangu grouped topk.
+        num_voted_experts = 5 if is_310p() else 8
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            quant_config=quant_config,
+            custom_routing_function=topk_wrapper(num_voted_experts),
+            prefix=f"{prefix}.experts",
+        )
+        self.use_ep = self.experts.use_ep
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = PanguProMoEMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_expert",
+            )
+        else:
+            self.shared_expert = None  # type: ignore
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: Optional[AttentionMetadata] = None) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        shared_output = None
+        if self.shared_expert is not None:
+            shared_output = self.shared_expert(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        # Publish router_scale for the custom routing function that FusedMoE
+        # will invoke (it has no other channel to receive this parameter).
+        global _ROUTER_SCALE
+        _ROUTER_SCALE = self.router_scale
+
+        # TODO(angazenn): Does not support MC2 currently
+        get_forward_context().moe_comm_method_name = "allgathercommimpl"
+
+        if not use_h2p():
+            final_hidden_states = self.experts.forward_impl(
+                hidden_states=hidden_states, router_logits=router_logits)
+        else:
+            # TODO: when using h2p, we have to skip communication in vLLM
+            # native FusedMoE. here we need to design a better FusedMoE
+            # (maybe using AscendFusedMoE) to enable these different
+            # communication schema.
+            # Call the quant method directly to bypass FusedMoE's built-in
+            # communication; H2P handles the collectives at the layer level.
+            final_hidden_states = self.experts.quant_method.apply(
+                layer=self.experts,
+                x=hidden_states,
+                router_logits=router_logits,
+                top_k=self.experts.top_k,
+                renormalize=False,
+                use_grouped_topk=False,
+                global_num_experts=self.experts.global_num_experts,
+                expert_map=self.experts.expert_map,
+                custom_routing_function=self.experts.custom_routing_function,
+                apply_router_weight_on_input=self.experts.
+                apply_router_weight_on_input)
+
+        # Shared-expert output is added before any cross-rank reduction.
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if not use_h2p():
+            final_hidden_states = tensor_model_parallel_all_reduce(
+                final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class PanguProMoEAttention(nn.Module):
+    """Multi-head attention with TP-partitioned QKV and an output projection
+    that uses CustomRowParallelLinear (communication-free) under H2P."""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        if use_h2p():
+            self.o_proj = CustomRowParallelLinear(self.total_num_heads *
+                                                  self.head_dim,
+                                                  hidden_size,
+                                                  bias=True,
+                                                  quant_config=quant_config,
+                                                  prefix=f"{prefix}.o_proj",
+                                                  group=get_tp_group())
+        else:
+            self.o_proj = RowParallelLinear(
+                self.total_num_heads * self.head_dim,
+                hidden_size,
+                bias=True,
+                quant_config=quant_config,
+                prefix=f"{prefix}.o_proj",
+            )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        ascend_config = get_ascend_config()
+        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: Optional[torch.Tensor] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+    ) -> torch.Tensor:
+        """QKV projection -> RoPE -> attention -> output projection."""
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        if self.torchair_graph_enabled:
+            # Torchair graph mode calls the attention impl directly with a
+            # preallocated output buffer and trace_flag=False.
+            forward_kwargs = {'trace_flag': False}
+            output_shape = q.shape
+            attn_output = torch.empty(output_shape,
+                                      dtype=q.dtype,
+                                      device=q.device)
+            forward_kwargs['output'] = attn_output
+            attn_output = self.attn.impl.forward(self.attn, q, k, v, kv_cache,
+                                                 attn_metadata,
+                                                 **forward_kwargs)
+        else:
+            attn_output = self.attn(q, k, v)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class PanguProMoEDecoderLayer(nn.Module):
+    """Decoder layer: self-attention + (sparse MoE or dense) MLP with
+    RMSNorm residual structure; implements the H2P collective choreography."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+
+        self.self_attn = PanguProMoEAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        # `mlp_only_layers` in the config.
+        layer_idx = extract_layer_index(prefix)
+        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
+                           config.mlp_only_layers)
+        if (layer_idx not in mlp_only_layers) and (config.num_experts > 0):
+            self.mlp = PanguProMoESparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        else:
+            self.mlp = PanguProMoEMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        kv_cache: Optional[torch.Tensor] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+        h2p_unpad_idx: Optional[torch.Tensor] = None,
+        h2p_pad_idx: Optional[torch.Tensor] = None,
+        is_start_layer: Optional[bool] = False,
+    ) -> torch.Tensor:
+        """Run one decoder layer.
+
+        Under H2P, activations are sequence-sharded: attention input is
+        all-gathered over the TP group, its output reduce-scattered back, and
+        the MoE input is all-gathered over ALL ranks then reduce-scattered
+        after the MLP.  h2p_pad_idx/h2p_unpad_idx pad/unpad the token dim so
+        it divides evenly across ranks.
+        """
+        need_h2p_pad = h2p_unpad_idx is not None and h2p_pad_idx is not None \
+            and h2p_unpad_idx.shape[0] < h2p_pad_idx.shape[0]
+        tp_size = get_tp_group().world_size
+
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+        if use_h2p():
+            if is_start_layer:
+                # First layer: shard the residual so it matches the
+                # sequence-sharded hidden states of later layers.
+                if need_h2p_pad:
+                    residual = residual.index_select(dim=0, index=h2p_pad_idx)
+                residual = torch.tensor_split(
+                    residual, tp_size)[get_tp_group().rank_in_group]
+            else:
+                if tp_size > 1:
+                    hidden_states = get_tp_group().all_gather(hidden_states, 0)
+                if need_h2p_pad:
+                    hidden_states = hidden_states.index_select(
+                        dim=0, index=h2p_unpad_idx)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+
+        if use_h2p():
+            if need_h2p_pad:
+                hidden_states = hidden_states.index_select(dim=0,
+                                                           index=h2p_pad_idx)
+            if tp_size > 1:
+                hidden_states = dist._functional_collectives.reduce_scatter_tensor(
+                    hidden_states,
+                    "sum",
+                    scatter_dim=0,
+                    group=get_tp_group().device_group)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+
+        if use_h2p():
+            all_rank_group = get_world_group().device_group
+            output_size = (hidden_states.shape[0] *
+                           get_world_group().world_size,
+                           hidden_states.shape[1])
+            # Allocate output tensor.
+            output_tensor = torch.empty(output_size,
+                                        dtype=hidden_states.dtype,
+                                        device=hidden_states.device)
+            # All-gather.
+            dist.all_gather_into_tensor(output_tensor,
+                                        hidden_states,
+                                        group=all_rank_group)
+            hidden_states = output_tensor
+
+        hidden_states = self.mlp(hidden_states, attn_metadata=attn_metadata)
+
+        if use_h2p():
+            hidden_states = dist._functional_collectives.reduce_scatter_tensor(
+                hidden_states,
+                "sum",
+                scatter_dim=0,
+                group=get_world_group().device_group)
+
+        return hidden_states, residual
+
+
+@support_torch_compile
+class PanguProMoEModel(nn.Module):
+    """Embedding + stacked PanguProMoEDecoderLayers + final RMSNorm."""
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens")
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: PanguProMoEDecoderLayer(config=config,
+                                                   cache_config=cache_config,
+                                                   quant_config=quant_config,
+                                                   prefix=prefix),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: Optional[List[torch.Tensor]] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+
+        if use_h2p():
+            # calculate necessary padding/unpadding idx before model forward.
+
+            # the attn_metadata will be passed directly when use torchair.
+            # if attn_metadata is not passed, we try to get it from forward_context.
+            if attn_metadata is None:
+                attn_metadata = get_forward_context().attn_metadata
+
+            max_tokens_across_dp = get_forward_context().max_tokens_across_dp
+
+            tp_size = get_tp_group().world_size
+            # reduce scatter will split the input tensor into equal sizes and then scatter them on all ranks.
+            # we need pad it before if the shape can't be divided by group size.
+            # for h2p, we need pad it so that it can be divided by tp_size.
+            h2p_padded_len = (
+                tp_size - (max_tokens_across_dp % tp_size)
+            ) % tp_size + max_tokens_across_dp - hidden_states.shape[0]
+            h2p_unpad_idx = torch.arange(hidden_states.shape[0],
+                                         device=hidden_states.device,
+                                         dtype=torch.int32)
+            # Pad by repeating index 0 (zeros) so the token count divides
+            # evenly by tp_size for the reduce-scatter inside the layers.
+            h2p_pad_idx = torch.cat([
+                h2p_unpad_idx,
+                torch.zeros(h2p_padded_len,
+                            dtype=torch.int32,
+                            device=hidden_states.device)
+            ])
+        else:
+            h2p_unpad_idx = None
+            h2p_pad_idx = None
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, residual,
+                kv_caches[i -
+                          self.start_layer] if kv_caches is not None else None,
+                attn_metadata, h2p_unpad_idx, h2p_pad_idx,
+                i == self.start_layer)
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+        hidden_states, _ = self.norm(hidden_states, residual)
+        if use_h2p():
+            # Re-assemble the full (unpadded) sequence from the shards.
+            if get_tp_group().world_size > 1:
+                hidden_states = get_tp_group().all_gather(hidden_states, 0)
+            if h2p_unpad_idx.shape[0] < h2p_pad_idx.shape[0]:
+                hidden_states = hidden_states.index_select(dim=0,
+                                                           index=h2p_unpad_idx)
+        return hidden_states
+
+
+class PanguProMoEForCausalLM(nn.Module, SupportsPP):
+    """Pangu Pro MoE causal LM: PanguProMoEModel plus LM head, logits
+    processor and sampler; supports pipeline parallelism (SupportsPP)."""
+
+    fall_back_to_pt_during_load = False
+
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+        "experts":
+        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
+    }
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        self.config = config
+        self.quant_config = quant_config
+        self.model = PanguProMoEModel(vllm_config=vllm_config,
+                                      prefix=maybe_prefix(prefix, "model"))
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.lm_head",
+        )
+        if self.config.tie_word_embeddings:
+            
self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config.vocab_size)
+        self.sampler = get_sampler()
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: Optional[List[torch.Tensor]] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata, intermediate_tensors,
+                                   inputs_embeds)
+        return hidden_states
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: Optional[torch.Tensor],
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load checkpoint weights, handling stacked params (qkv/gate_up),
+        per-expert params, and remapped kv-cache (anti)quant scales.
+        Returns the set of parameter names that were loaded."""
+        tp_size = get_tp_group().world_size
+        tp_rank = get_tp_group().rank_in_group
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts)
+
+        params_dict = dict(self.named_parameters())  # from model
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            # Support loading checkpoints with fewer layers than this rank
+            # holds: skip weights for layers at/after this rank's end_layer.
+            if 'layers' in name:
+                layer_idx = int(name.split('layers.')[-1].split('.')[0])
+                if layer_idx >= self.model.end_layer:
+                    continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "module" in name:
+                continue
+
+            if name.endswith('kv_cache_offset'):
+                continue
+
+            if name.endswith("k_proj.kv_cache_scale"):
+                remapped_kv_scale_name = name.replace(
+                    "k_proj.kv_cache_scale", "attn.key_antiquant_scale")
+                if remapped_kv_scale_name not in params_dict:
+                    logger.warning_once(
+                        "Found kv scale in the checkpoint "
+                        f"(e.g. {name}), but not found the expected "
+                        f"name in the model "
+                        f"(e.g. {remapped_kv_scale_name}). "
+                        "kv-scale is not loaded.")
+                    continue
+                else:
+                    name = remapped_kv_scale_name
+                    param = params_dict[name]
+                    loaded_weight = torch.tensor_split(loaded_weight,
+                                                       tp_size,
+                                                       dim=0)[tp_rank]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            # NOTE(review): no `continue` after the kv-scale load above (and in
+            # the v_proj branch below) — the remapped name falls through to the
+            # generic path at the bottom, which copies the same shard a second
+            # time. Appears redundant but harmless; confirm intended.
+
+            if name.endswith("v_proj.kv_cache_scale"):
+                remapped_kv_scale_name = name.replace(
+                    "v_proj.kv_cache_scale", "attn.value_antiquant_scale")
+                if remapped_kv_scale_name not in params_dict:
+                    logger.warning_once(
+                        "Found kv scale in the checkpoint "
+                        f"(e.g. {name}), but not found the expected "
+                        f"name in the model "
+                        f"(e.g. {remapped_kv_scale_name}). "
+                        "kv-scale is not loaded.")
+                    continue
+                else:
+                    name = remapped_kv_scale_name
+                    param = params_dict[name]
+                    loaded_weight = torch.tensor_split(loaded_weight,
+                                                       tp_size,
+                                                       dim=0)[tp_rank]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if ((name.endswith(".bias") or name.endswith("_bias"))
+                        and name not in params_dict):
+                    continue
+
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param,
+                                  loaded_weight,
+                                  name,
+                                  shard_id=shard_id,
+                                  expert_id=expert_id)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if ((name.endswith(".bias") or name.endswith("_bias"))
+                            and name not in params_dict):
+                        continue
+                    # Skip layers on other devices.
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale")
+                        if remapped_kv_scale_name not in params_dict:
+                            logger.warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded.")
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
+                    param = params_dict[name]
+                    weight_loader = getattr(param, "weight_loader",
+                                            default_weight_loader)
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+            if is_310p() and "head" in name:
+                # on 300I Duo platform, ACL_FORMAT_FRACTAL_NZ is much more preferred than
+                # ACL_FORMAT_FRACTAL_ND by matmul operation. Since lmhead is also implemented
+                # by linear, we manually cast the format here.
+                param.data = torch_npu.npu_format_cast(param.data,
+                                                       ACL_FORMAT_FRACTAL_NZ)
+        return loaded_params
diff --git a/vllm_ascend/torchair/ops/__init__.py b/vllm_ascend/torchair/ops/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vllm_ascend/torchair/ops/torchair_fused_moe.py b/vllm_ascend/torchair/ops/torchair_fused_moe.py
new file mode 100644
index 0000000..bd2be21
--- /dev/null
+++ b/vllm_ascend/torchair/ops/torchair_fused_moe.py
@@ -0,0 +1,1321 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/kernels/test_moe.py + +import os +from typing import Any, Callable, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch_npu +from torch import nn +from vllm.config import get_current_vllm_config +from vllm.distributed import (GroupCoordinator, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce) +from vllm.distributed.parallel_state import (get_dp_group, get_ep_group, + get_tp_group) +from vllm.forward_context import get_forward_context +from vllm.model_executor.layers.fused_moe.config import \ + FusedMoEConfig # isort: skip +from vllm.model_executor.layers.fused_moe.config import \ + FusedMoEParallelConfig # isort: skip +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, UnquantizedFusedMoEMethod, determine_expert_map) +from vllm.model_executor.layers.quantization.base_config import \ + QuantizationConfig + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.distributed.communication_op import \ + data_parallel_reduce_scatter +from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer +from vllm_ascend.ops.sequence_parallel import MetadataForPadding +from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod +from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor +from vllm_ascend.utils import (AscendSocVersion, dispose_tensor, + get_all_reduce_merge_state, + get_ascend_soc_version, + get_rm_router_logits_state, is_310p) + + +def torchair_fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + moe_parallel_config: FusedMoEParallelConfig, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: Optional[str] = None, + 
shared_experts: Optional[Any] = None, + is_torchair: bool = False, + mc2_mask: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + quant_mode = 0 + ep_rank_id = moe_parallel_config.ep_rank + ep_world_size = moe_parallel_config.ep_size + + # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine + need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3 + or is_torchair) + + # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine + a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3 + + enable_dispatch_v2 = hasattr(torch_npu, "npu_moe_distribute_dispatch_v2") + + moe_expert_num = len(expert_map) + kwargs_mc2 = { + "x": hidden_states, + "expert_ids": topk_ids, + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + + stage1_kwargs = { + "scales": None, + "quant_mode": quant_mode, + "group_ep": moe_all_to_all_group_name, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, + } + if need_extra_args: + stage1_kwargs.update({ + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if a3_need_extra_args and enable_dispatch_v2: + stage1_kwargs.update({ + "x_active_mask": mc2_mask, + }) + + kwargs_mc2.update(stage1_kwargs) + + output = torch_npu.npu_moe_distribute_dispatch_v2( + **kwargs_mc2 + ) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_dispatch( + **kwargs_mc2) + # comm_stream.wait_stream(torch.npu.current_stream()) + expand_x, dynamic_scale, assist_info_for_combine, expert_token_nums, ep_recv_counts = output[ + 0:5] + + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(hidden_states, topk_weights) + shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states) + npu_wait_tensor(shared_gate_up, expand_x) + shared_act = shared_experts.act_fn(shared_gate_up) 
+ + w1 = w1.transpose(1, 2) + + group_list = expert_token_nums.to(torch.int64) + gate_up_out_list = torch_npu.npu_grouped_matmul( + x=[expand_x], + weight=[w1], + split_item=2, + # 1 means count mode, to avoid cumulative operation of the group list + group_list_type=1, + group_type=0, + group_list=group_list, + )[0] + + gate_up_out = torch_npu.npu_swiglu(gate_up_out_list) + + w2 = w2.transpose(1, 2) + down_out_list = torch_npu.npu_grouped_matmul( + x=[gate_up_out], + weight=[w2], + split_item=2, + group_list_type=1, + group_type=0, + group_list=group_list, + )[0] + + # moeCombine + kwargs_mc2 = { + "expand_x": down_out_list, + "expert_ids": topk_ids, + "expert_scales": topk_weights.to(torch.float32), + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + tp_recv_counts = output[5] + stage3_kwargs = { + "ep_send_counts": ep_recv_counts, + "group_ep": moe_all_to_all_group_name, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, + } + if enable_dispatch_v2: + stage3_kwargs.update({ + "assist_info_for_combine": + assist_info_for_combine, + }) + else: + stage3_kwargs.update({ + "expand_idx": assist_info_for_combine, + }) + if need_extra_args: + stage3_kwargs.update({ + "tp_send_counts": tp_recv_counts, + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if a3_need_extra_args and enable_dispatch_v2: + stage3_kwargs.update({ + "x_active_mask": mc2_mask, + }) + kwargs_mc2.update(stage3_kwargs) + + hidden_states = torch_npu.npu_moe_distribute_combine_v2( + **kwargs_mc2 + ) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine( + **kwargs_mc2) + + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_hidden_states, _ = shared_experts.down_proj(shared_act) + return hidden_states, shared_hidden_states + + +def torchair_apply_mlp( + hidden_states: 
torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + group_list: torch.Tensor, + group_list_type: int = 1, +) -> torch.Tensor: + """ + apply MLP: gate_up_proj -> swiglu -> down_proj + + Args: + hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size). + w1: expert weights1 with shape + (num_experts, hidden_size, intermediate_size * 2) + w2: expert weights2 with shape + (num_experts, intermediate_size, hidden_size) + group_list: number of tokens for each expert, follow cumsum mode, and + with shape (num_experts). + transpose_weight: + w1: (num_experts, intermediate_size * 2, hidden_size) -> + (num_experts, hidden_size, intermediate_size * 2) + w2: (num_experts, hidden_size, intermediate_size) -> + (num_experts, intermediate_size, hidden_size) + + Returns: + hidden_states: output hidden states after MLP. + """ + + w1 = w1.transpose(1, 2) + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + )[0] + + hidden_states = torch_npu.npu_swiglu(hidden_states) + + w2 = w2.transpose(1, 2) + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + )[0] + + return hidden_states + + +# currently expert parallelism implemented with all2all +# is under-optimized. 
+def torchair_fused_experts_with_all2all( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + ep_group: GroupCoordinator = None, +): + original_shape = hidden_states.shape + if len(original_shape) == 3: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + num_tokens, _ = hidden_states.shape + num_experts = w1.shape[0] + device = hidden_states.device + + if expert_map is not None: + global_num_experts = len(expert_map) + local_num_experts = global_num_experts // ep_group.world_size + row_idx_len = num_tokens * top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=device).view(top_k, -1).permute( + 1, 0).contiguous()) + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + global_expert_tokens = torch.bincount(expanded_expert_idx, + minlength=global_num_experts) + scatter_sizes = global_expert_tokens.view(ep_group.world_size, + -1).sum(-1) + + gather_sizes = torch.empty_like(scatter_sizes) + dist.all_to_all_single(gather_sizes, + scatter_sizes, + group=ep_group.device_group) + scatter_size_list = scatter_sizes.cpu().tolist() + gather_size_list = gather_sizes.cpu().tolist() + + expanded_expert_idx = expanded_expert_idx % local_num_experts + hidden_states = ep_group.all_to_all(hidden_states, 0, 0, + scatter_size_list, + gather_size_list) + local_expert_idx = ep_group.all_to_all(expanded_expert_idx, 0, 0, + scatter_size_list, + gather_size_list) + + sorted_local_expert_idx, sorted_idx = torch.sort(local_expert_idx) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + sorted_local_expert_idx, local_num_experts).to(torch.int64) + + hidden_states = hidden_states[sorted_idx] + else: + row_idx_len = num_tokens * top_k + row_idx = torch.arange(0, + row_idx_len, + dtype=torch.int32, 
+ device=topk_weights.device).view( + top_k, -1).permute(1, 0).contiguous() + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, num_experts) + expert_tokens = expert_tokens.to(torch.int64) + + w1 = w1.transpose(1, 2) + gate_up_out_list = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + )[0] + + hidden_states = torch_npu.npu_swiglu(gate_up_out_list) + + w2 = w2.transpose(1, 2) + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + )[0] + + if expert_map is not None: + resorted_idx = torch.argsort(sorted_idx) + hidden_states = hidden_states[resorted_idx] + hidden_states = ep_group.all_to_all(hidden_states, 0, 0, + gather_size_list, + scatter_size_list) + + final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + else: + # TODO: Reorder device memory 2 times here, replace the current + # implementation here when suitable operators become available. 
+ final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) + return final_hidden_states + + +def torchair_fused_experts_moge( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + moe_parallel_config: FusedMoEParallelConfig, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + global_num_experts: int, + expert_map: torch.Tensor = None, + apply_router_weight_on_input: bool = False, +) -> torch.Tensor: + """ + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size). + w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size). + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + top_k: Number of experts to select. + expert_map: Expert mapping of shape (num_experts,). + + Returns: + hidden_states: Hidden states after routing. 
+ """ + ep_size = moe_parallel_config.ep_size + local_num_experts = global_num_experts // ep_size + local_num_group = top_k // ep_size + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * topk_weights.to(hidden_states.dtype) + + bsz, _ = hidden_states.shape + flatten_topk_ids = topk_ids.view(-1) + sorted_topk_ids = torch.argsort(flatten_topk_ids.float()) + sorted_topk_ids = sorted_topk_ids.to(torch.int32) + sorted_hidden_states = hidden_states.index_select( + 0, sorted_topk_ids // local_num_group) + + experts_id = torch.arange(0, + local_num_experts, + dtype=topk_ids.dtype, + device=topk_ids.device) + num_tokens_per_expert = (flatten_topk_ids.unsqueeze(-1) == experts_id).to( + torch.float32).sum(0) + topk_scales = topk_weights.view(-1).index_select( + 0, sorted_topk_ids).unsqueeze(-1) + group_list = num_tokens_per_expert.cumsum(dim=0).to(torch.int64) + + w1 = w1.transpose(1, 2) + gate_up_out = torch_npu.npu_grouped_matmul( + x=[sorted_hidden_states], + weight=[w1], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + if is_310p(): + gate_up_out = torch_npu.npu_swiglu(gate_up_out.to(torch.float32)).to( + torch.float16) + else: + gate_up_out = torch_npu.npu_swiglu(gate_up_out) + gate_up_out *= topk_scales + + w2 = w2.transpose(1, 2) + down_out_list = torch_npu.npu_grouped_matmul( + x=[gate_up_out], + weight=[w2], + split_item=2, + group_list_type=0, + group_type=0, + group_list=group_list, + )[0] + + unsorted_topk_ids = torch.argsort(sorted_topk_ids.float()).to(torch.int32) + unsorted_hidden_states = down_out_list.index_select(0, unsorted_topk_ids) + final_hidden_states = unsorted_hidden_states.reshape( + bsz, top_k // ep_size, -1).sum(1) + + return final_hidden_states + + +def 
torchair_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + apply_router_weight_on_input: bool = False, + max_num_tokens: Optional[int] = None, +) -> torch.Tensor: + """ + Fused experts with top-k routing. + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + w1: Expert weights1 of shape (num_experts, intermediate_size * 2, hidden_size). + w2: Expert weights2 of shape (num_experts, hidden_size, intermediate_size). + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + top_k: Number of experts to select. + expert_map: Expert mapping of shape (num_experts,). + + Returns: + hidden_states: Hidden states after routing. + """ + """ + # Check constraints. + assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" + assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" + assert w1.is_contiguous(), "Expert weights1 must be contiguous" + assert w2.is_contiguous(), "Expert weights2 must be contiguous" + """ + # if torch.distributed.get_rank() == 0: + # print(w1.shape) + # print(hidden_states.shape) + + original_shape = hidden_states.shape + # assert len(original_shape) == 2 + + num_tokens = hidden_states.shape[:-1].numel() + num_experts = w1.shape[0] + dtype = hidden_states.dtype + device = hidden_states.device + # assert dtype in [torch.float32, torch.float16, torch.bfloat16 + # ], "Only float32, float16, and bfloat16 are supported" + + if apply_router_weight_on_input: + assert (topk_weights.dim() == 2 + ), "`topk_weights` should be in shape (num_tokens, topk)" + _, topk = topk_weights.shape + assert ( + topk == 1 + ), "Only support topk=1 when `apply_router_weight_on_input` is True" + hidden_states = hidden_states * 
topk_weights.to(hidden_states.dtype) + + if expert_map is not None: + # Generate token indices and flatten + token_indices = (torch.arange(num_tokens, + device=device, + dtype=torch.int64).unsqueeze(1).expand( + -1, top_k).reshape(-1)) + + # Flatten token-to-expert mappings and map to local experts + weights_flat = topk_weights.view(-1) + experts_flat = topk_ids.view(-1) + local_experts_flat = expert_map[experts_flat] + + # Filter valid token-expert pairs + mask = local_experts_flat != -1 + filtered_weights = torch.where( + mask, weights_flat, torch.zeros_like(weights_flat)).to(dtype) + filtered_experts = torch.where( + mask, local_experts_flat, + torch.full_like(local_experts_flat, + num_experts)).to(topk_ids.dtype) + + # Sort by local expert IDs + sort_indices = torch.argsort(filtered_experts.view(torch.float32)) + sorted_token_indices = token_indices[sort_indices] + sorted_weights = filtered_weights[sort_indices] + + # Compute token counts with minlength of num_experts + # This is equivalent to but faster than: + # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1] + token_counts = torch.zeros(num_experts + 1, + device=device, + dtype=torch.int64) + ones = torch.ones_like(filtered_experts, dtype=torch.int64) + token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones) + token_counts = token_counts[:num_experts] + expert_tokens = torch.cumsum(token_counts, dim=0, dtype=torch.int64) + + # Rearrange hidden_states + sorted_hidden_states = hidden_states[sorted_token_indices] + else: + row_idx_len = num_tokens * top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=device).view(top_k, -1).permute( + 1, 0).contiguous()) + active_num = max_num_tokens if max_num_tokens is not None else num_tokens + sorted_hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=active_num) + + expert_tokens = 
torch_npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, num_experts) + expert_tokens = expert_tokens.to(torch.int64) + + w1 = w1.transpose(1, 2) + gate_up_out_list = torch_npu.npu_grouped_matmul( + x=[sorted_hidden_states], + weight=[w1], + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + )[0] + + gate_up_out = torch_npu.npu_swiglu(gate_up_out_list) + + w2 = w2.transpose(1, 2) + down_out_list = torch_npu.npu_grouped_matmul( + x=[gate_up_out], + weight=[w2], + split_item=2, + group_list_type=0, + group_type=0, + group_list=expert_tokens, + )[0] + + if expert_map is not None: + weighted_down_out = down_out_list * sorted_weights.unsqueeze(1) + + final_hidden_states = torch.zeros(*original_shape, + device=hidden_states.device, + dtype=dtype) + + # TODO: npu_grouped_matmul output random values at [num_valid_tokens:, ...] + # This created multiple NaN and index_add_ will mix them up which harms accuracy + # remove this mask and filter after it being fixed + num_valid_tokens = mask.sum() + valid_token_mask = torch.arange( + 0, sorted_token_indices.shape[0], + device=device).unsqueeze(1) < num_valid_tokens + valid_output = torch.where( + valid_token_mask, weighted_down_out, + torch.zeros_like(weighted_down_out)).to(dtype) + final_hidden_states.index_add_(0, sorted_token_indices, valid_output) + else: + scales = torch.ones_like( + topk_weights) if apply_router_weight_on_input else topk_weights + # TODO: Reorder device memory 2 times here, replace the current + # implementation here when suitable operators become available. 
+ final_hidden_states = torch_npu.npu_moe_finalize_routing( + down_out_list, + skip1=None, + skip2=None, + bias=None, + scales=scales, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + + return final_hidden_states + + +def torchair_native_grouped_topk( + topk_weights: torch.Tensor, + num_expert_group: Optional[int], + topk_group: Optional[int], +): + topk_group = 0 if topk_group is None else topk_group + num_expert_group = 0 if num_expert_group is None else num_expert_group + + num_token = topk_weights.shape[0] + grouped_weights = topk_weights.view(num_token, num_expert_group, + -1).max(dim=-1).values + topk_group_indices = torch.topk(grouped_weights.to(torch.float32), + k=topk_group, + dim=-1, + sorted=False)[1] + topk_group_mask = torch.zeros_like(grouped_weights) + topk_group_mask.scatter_(1, topk_group_indices, 1) + topk_weight_mask = (topk_group_mask.unsqueeze(-1).expand( + num_token, num_expert_group, + topk_weights.shape[-1] // num_expert_group).reshape(num_token, -1)) + topk_weights = topk_weights.masked_fill(~topk_weight_mask.bool(), 0.0) + + return topk_weights + + +def torchair_select_experts( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + use_grouped_topk: bool, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + global_num_experts: Optional[torch.Tensor] = None +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Select top-k experts based on router logits. + + Args: + hidden_states: Hidden states of shape (num_tokens, hidden_size). + router_logits: Router logits of shape (num_tokens, num_experts). + top_k: Number of experts to select. + use_grouped_topk: Whether to group experts before selecting top-k. + renormalize: Whether to renormalize the routing weights. 
+ topk_group: Number of expert groups to select from. + num_expert_group: Number of experts in each group. + custom_routing_function: Custom routing function. + scoring_func: Scoring function to use. + e_score_correction_bias: Correction bias to apply to expert scores. + + Returns: + topk_weights: Routing weights of shape (num_tokens, top_k). + topk_ids: Selected expert IDs of shape (num_tokens, top_k). + + Raises: + ValueError: If an unsupported scoring function is provided. + """ + + def _renormalize_topk_weights( + topk_weights: torch.Tensor, + renormalize: bool, + ): + if renormalize: + topk_weights = topk_weights / topk_weights.sum(dim=-1, + keepdim=True) + return topk_weights + + if scoring_func == "softmax": + # NOTE: vLLM use dtype=torch.float here + if not use_grouped_topk and custom_routing_function is None: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax( + x=router_logits, finished=None, k=top_k) + topk_ids = topk_ids.to(torch.int32) + topk_weights = _renormalize_topk_weights(topk_weights, renormalize) + return topk_weights, topk_ids + + topk_weights = router_logits.softmax(dim=-1) + elif scoring_func == "sigmoid": + topk_weights = router_logits.sigmoid() + else: + raise ValueError(f"Unsupported scoring function: {scoring_func}") + + if use_grouped_topk: + assert topk_group is not None + assert num_expert_group is not None + + if e_score_correction_bias is not None: + # Store original scores before applying correction bias. We use biased + # scores for expert selection but original scores for routing weights + original_weights = topk_weights + topk_weights = topk_weights + e_score_correction_bias.unsqueeze(0) + + # TODO: Change to npu_group_topk when the latest CANN and NNAL is available + # >>> torch_npu._npu_group_topk(topk_weights, group_num=num_expert_group, k=topk_group) + topk_weights = torchair_native_grouped_topk(topk_weights, + num_expert_group, + topk_group) + # TODO bfloat16 is not supported in torch.topk with ge graph. 
+ if e_score_correction_bias is not None: + topk_ids = torch.topk(topk_weights.to(torch.float32), + k=top_k, + dim=-1, + sorted=False)[1] + # Use original unbiased scores for the routing weights + topk_weights = original_weights.gather(1, topk_ids) + else: + topk_weights, topk_ids = torch.topk(topk_weights.to(torch.float32), + k=top_k, + dim=-1, + sorted=False) + topk_ids = topk_ids.to(torch.int32) + topk_weights = _renormalize_topk_weights(topk_weights, renormalize) + return topk_weights, topk_ids + + if custom_routing_function is not None: + topk_weights, topk_ids = custom_routing_function( + hidden_states=hidden_states, + gating_output=router_logits, + topk=top_k, + renormalize=renormalize, + global_num_experts=global_num_experts) + # Required by npu_moe_init_routing + topk_ids = topk_ids.to(torch.int32) + return topk_weights, topk_ids + + topk_weights, topk_ids = topk_weights.topk(top_k, dim=-1) + topk_weights = topk_weights.to(hidden_states.dtype) + + # Required by npu_moe_init_routing + topk_ids = topk_ids.to(torch.int32) + topk_weights = _renormalize_topk_weights(topk_weights, renormalize) + + return topk_weights, topk_ids + + +class TorchairAscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod): + + def __init__(self, moe: FusedMoEConfig = None): + + super().__init__(moe=moe) + vllm_config = get_current_vllm_config() + + self.global_batch_size = vllm_config.scheduler_config.max_num_seqs + self.max_model_len = vllm_config.model_config.max_model_len + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + try: + device_group = get_mc2_group().device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name( + local_rank) + except AttributeError: + self.moe_all_to_all_group_name = None + + def 
process_weights_after_loading(self, layer): + super(UnquantizedFusedMoEMethod, + self).process_weights_after_loading(layer) + layer.w13_weight = torch.nn.Parameter(self._maybe_pad_weight( + layer.w13_weight.data), + requires_grad=False) + layer.w2_weight = torch.nn.Parameter(self._maybe_pad_weight( + layer.w2_weight.data), + requires_grad=False) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = False, + enable_force_load_balance: bool = False, + shared_experts: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + + is_deepseek_v3_r1 = global_num_experts == 256 + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + if is_deepseek_v3_r1: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=top_k, # topk currently is 8 + bias=e_score_correction_bias, + k_group=topk_group, # fix: 4 + group_count=num_expert_group, # fix 8 + group_select_mode= + 1, # 0: the maximum in the group; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + # out_flag=False, # todo new api; should the third output be output + # y2_flag=False, # old api; should the third output be output + routed_scaling_factor=1, + eps=float(1e-20)) + else: + topk_weights, topk_ids = torchair_select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + 
scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + topk_weights = topk_weights.to(x.dtype) + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. + if enable_force_load_balance and not self.use_aclgraph: + topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) + + fused_moe_state = get_forward_context().fused_moe_state + + if fused_moe_state == FusedMoEState.MC2: + return torchair_fused_experts_with_mc2( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + moe_parallel_config=self.moe.moe_parallel_config, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + moe_all_to_all_group_name=self.moe_all_to_all_group_name, + shared_experts=shared_experts, + mc2_mask=kwargs.get("mc2_mask", None)) + elif fused_moe_state in [ + FusedMoEState.AllGather, FusedMoEState.NaiveMulticast + ]: + return torchair_fused_experts(hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map) + else: + return torchair_fused_experts_with_all2all( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + ep_group=get_ep_group()) + + +class TorchairAscendFusedMoE(FusedMoE): + + # The moe_counter parameter is required during the initialization of EPLB + # to identify the current layer index within the MOE model. 
+ moe_counter = -1 + + def __init__( + self, + num_experts: int, # Global number of experts + top_k: int, + hidden_size: int, + intermediate_size: int, + params_dtype: Optional[torch.dtype] = None, + reduce_results: bool = False, + renormalize: bool = True, + use_grouped_topk: bool = False, + num_expert_group: Optional[int] = None, + topk_group: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + tp_size: Optional[int] = None, + ep_size: Optional[int] = None, + dp_size: Optional[int] = None, + prefix: str = "", + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + ): + # TODO: This could not initialize FusedMoE baseclass, + # fixme and make __init__() of AscendFusedMoE more clear + super().__init__( + num_experts=num_experts, + top_k=top_k, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + params_dtype=params_dtype, + reduce_results=reduce_results, + renormalize=renormalize, + use_grouped_topk=use_grouped_topk, + num_expert_group=num_expert_group, + topk_group=topk_group, + quant_config=quant_config, + tp_size=tp_size, + ep_size=ep_size, + dp_size=dp_size, + prefix=prefix, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + activation=activation, + ) + TorchairAscendFusedMoE.moe_counter += 1 + self.moe_instance_id = TorchairAscendFusedMoE.moe_counter + + if params_dtype is None: + params_dtype = torch.get_default_dtype() + + vllm_config = get_current_vllm_config() + + self.moe_parallel_config = FusedMoEParallelConfig.make( + tp_size_=(tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()), + dp_size_=(dp_size + if dp_size is not None else get_dp_group().world_size), + vllm_parallel_config=vllm_config.parallel_config) + + self.top_k = top_k + 
self.num_experts = num_experts + self.global_num_experts = num_experts + assert intermediate_size % self.tp_size == 0 + self.intermediate_size_per_partition = intermediate_size // self.tp_size + self.reduce_results = reduce_results + self.renormalize = renormalize + self.use_grouped_topk = use_grouped_topk + if self.use_grouped_topk: + assert num_expert_group is not None and topk_group is not None + self.num_expert_group = num_expert_group + self.topk_group = topk_group + self.custom_routing_function = custom_routing_function + self.scoring_func = scoring_func + self.e_score_correction_bias = e_score_correction_bias + self.expert_map = None + self.activation = activation + self.log2phy = None + self.global_redundant_expert_num = 0 + + is_deepseek_v3_r1 = self.global_num_experts == 256 + self.rm_router_logits = get_rm_router_logits_state( + self.moe_parallel_config.ep_size, self.dp_size, is_deepseek_v3_r1) + self.all_reduce_merge = get_all_reduce_merge_state( + self.moe_parallel_config.ep_size, is_deepseek_v3_r1) + + ascend_config = get_ascend_config() + expert_map_path = ascend_config.expert_map_path + if expert_map_path and os.path.exists(expert_map_path): + # moe expert load balance + expert_load_balancer = ExpertLoadBalancer(expert_map_path, + self.global_num_experts) + self.local_num_experts, self.expert_map = \ + expert_load_balancer.get_rank_placement_map( + self.moe_instance_id, + get_ep_group().rank_in_group) + self.log2phy = expert_load_balancer.get_rank_log2phy_map( + self.moe_instance_id, + get_ep_group().rank_in_group) + self.global_redundant_expert_num = \ + expert_load_balancer.get_global_redundant_expert_num() + else: + # Create a tensor of size num_experts filled with -1 + self.local_num_experts, self.expert_map = determine_expert_map( + self.ep_size, + get_ep_group().rank_in_group, self.global_num_experts) + + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_multistream_moe = \ + 
ascend_config.torchair_graph_config.enable_multistream_moe and \ + self.torchair_graph_enabled + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + + if self.scoring_func != "softmax" and not self.use_grouped_topk: + raise ValueError("Only softmax scoring function is supported for " + "non-grouped topk.") + self.moe = FusedMoEConfig.make( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + # TODO (bnell): this needs to be fixed for quantized types. + in_dtype=params_dtype, + quant_config=quant_config) + + if quant_config is None: + self.quant_method = TorchairAscendUnquantizedFusedMoEMethod( + self.moe) + else: + if quant_config.is_layer_skipped_ascend( + prefix, quant_config.packed_modules_mapping): + self.quant_method = TorchairAscendUnquantizedFusedMoEMethod( + self.moe) + else: + self.quant_method = AscendFusedMoEMethod( + quant_config, prefix, quant_config.packed_modules_mapping) + + assert self.quant_method is not None + + local_num_experts = torch.sum(self.expert_map != -1) \ + if self.expert_map is not None else num_experts + + moe_quant_params = { + "num_experts": local_num_experts, + "hidden_size": hidden_size, + "intermediate_size_per_partition": + self.intermediate_size_per_partition, + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + } + # need full intermediate size pre-sharding for WNA16 act order + if (self.quant_method.__class__.__name__ + in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")): + moe_quant_params["intermediate_size_full"] = intermediate_size + + self.ep_group = get_ep_group() + # NOTE: self.tp_group is not expert_tp_group + self.tp_group = get_tp_group().device_group + self.quant_method.create_weights(layer=self, **moe_quant_params) + + def naive_multicast(self, x: torch.Tensor, + cu_tokens_across_dp_cpu: torch.Tensor): + assert (len(x.shape) == 2) + 
buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)), + device=x.device, + dtype=x.dtype) + start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[ + self.dp_rank - 1] + end = cu_tokens_across_dp_cpu[self.dp_rank] + buffer[start:end, :].copy_(x) + for idx in range(self.dp_size): + start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1] + end = cu_tokens_across_dp_cpu[idx] + get_dp_group().broadcast(buffer[start:end, :], idx) + return buffer + + def forward(self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_prefill: bool, + enable_force_load_balance: bool = False, + top_k: Optional[int] = None, + shared_experts: Optional[Any] = None, + gate=None, + replace_allreduce: bool = False, + _metadata_for_padding: Optional[MetadataForPadding] = None): + + assert self.quant_method is not None + + if top_k: + real_top_k = top_k + else: + real_top_k = self.top_k + + num_tokens, hidden_size = hidden_states.shape + + forward_context = get_forward_context() + fused_moe_state = forward_context.fused_moe_state + mc2_mask = forward_context.mc2_mask + # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel. 
+ quantized_x_for_share, dynamic_scale_for_share = None, None + from vllm_ascend.quantization.w8a8_dynamic import \ + AscendW8A8DynamicFusedMoEMethod + if self.enable_multistream_moe: + if not self.rm_router_logits: + router_logits, _ = gate(hidden_states) + if hasattr(self.quant_method, "quant_method") and \ + isinstance(self.quant_method.quant_method, + AscendW8A8DynamicFusedMoEMethod + ) and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + quantized_x_for_share, dynamic_scale_for_share = torch_npu.npu_dynamic_quant( + hidden_states) + + if shared_experts: + if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2: + # When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce + shared_hidden_states = shared_experts(hidden_states) + + mc2_mask = forward_context.mc2_mask + + enable_sp = _metadata_for_padding is not None and _metadata_for_padding.not_dummy_and_is_prefill + tp_size = get_tensor_model_parallel_world_size() + if enable_sp: + tp_rank = get_tensor_model_parallel_rank() + mc2_mask_sp = _metadata_for_padding.mc2_mask if _metadata_for_padding is not None else forward_context.mc2_mask + chunk_mc2_mask = torch.tensor_split(mc2_mask_sp, tp_size, dim=0) + mc2_mask = chunk_mc2_mask[tp_rank] + replace_allreduce = True + + if (fused_moe_state not in [ + FusedMoEState.AllGather, FusedMoEState.AllGatherEP, + FusedMoEState.NaiveMulticast + ] and not replace_allreduce): + if fused_moe_state in {FusedMoEState.MC2}: + padding_size = forward_context.padded_num_tokens + else: + # TODO: Determine if we can remove the padding + padding_size = tp_size + if num_tokens < padding_size and not self.enable_shared_expert_dp: + hidden_states = nn.functional.pad( + hidden_states, (0, 0, 0, padding_size - num_tokens)) + router_logits = nn.functional.pad( + router_logits, (0, 0, 0, padding_size - num_tokens)) + if tp_size 
> 1: + tp_rank = get_tensor_model_parallel_rank() + if not self.enable_shared_expert_dp: + chunk_hidden_states = torch.tensor_split(hidden_states, + tp_size, + dim=0) + chunk_router_logits = torch.tensor_split(router_logits, + tp_size, + dim=0) + hidden_states = chunk_hidden_states[tp_rank] + router_logits = chunk_router_logits[tp_rank] + + chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0) + mc2_mask = chunk_mc2_mask[tp_rank] + + if self.dp_size > 1: + if fused_moe_state == FusedMoEState.AllGather: + # NOTE: When in torchair graph, it has been padded in model_runner_v1 + if not self.torchair_graph_enabled: + max_tokens_across_dp = forward_context.max_tokens_across_dp + if num_tokens < max_tokens_across_dp: + hidden_states = nn.functional.pad( + hidden_states, + (0, 0, 0, max_tokens_across_dp - num_tokens)) + if not self.rm_router_logits: + router_logits = nn.functional.pad( + router_logits, + (0, 0, 0, max_tokens_across_dp - num_tokens)) + hidden_states = get_dp_group().all_gather(hidden_states, 0) + if self.rm_router_logits: + router_logits, _ = gate(hidden_states) + else: + router_logits = get_dp_group().all_gather(router_logits, 0) + + elif fused_moe_state == FusedMoEState.NaiveMulticast: + cu_tokens_across_dp_cpu = get_forward_context( + ).dp_metadata.cu_tokens_across_dp_cpu + hidden_states = self.naive_multicast(hidden_states, + cu_tokens_across_dp_cpu) + if self.rm_router_logits: + router_logits, _ = gate(hidden_states) + else: + router_logits = self.naive_multicast( + router_logits, cu_tokens_across_dp_cpu) + + # Matrix multiply. 
+ e_hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states, + router_logits=router_logits, + top_k=real_top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, + is_prefill=is_prefill, + enable_force_load_balance=enable_force_load_balance, + log2phy=self.log2phy, + global_redundant_expert_num=self.global_redundant_expert_num, + shared_experts=shared_experts if self.torchair_graph_enabled + and self.enable_multistream_moe and not is_prefill else None, + mc2_mask=mc2_mask, + quantized_x_for_share=quantized_x_for_share, + dynamic_scale_for_share=dynamic_scale_for_share, + ) + + if shared_experts: + if isinstance(e_hidden_states, tuple): + e_hidden_states, shared_hidden_states = e_hidden_states + + if (fused_moe_state not in [ + FusedMoEState.AllGather, FusedMoEState.AllGatherEP, + FusedMoEState.NaiveMulticast + ] and not replace_allreduce and not self.enable_shared_expert_dp): + if tp_size > 1: + dist.all_gather(list(chunk_hidden_states), e_hidden_states, + self.tp_group) + final_hidden_states = torch.cat(chunk_hidden_states, dim=0) + dispose_tensor(e_hidden_states) + else: + final_hidden_states = e_hidden_states + if num_tokens < padding_size: + final_hidden_states = final_hidden_states[:num_tokens] + elif self.dp_size > 1 and not self.enable_shared_expert_dp: + if fused_moe_state == FusedMoEState.NaiveMulticast: + start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[ + self.dp_rank - 1] + end = cu_tokens_across_dp_cpu[self.dp_rank] + final_hidden_states = get_dp_group().all_reduce( + e_hidden_states) + final_hidden_states = final_hidden_states[start:end, :] + dispose_tensor(e_hidden_states) + elif fused_moe_state == 
FusedMoEState.AllGather: + final_hidden_states = data_parallel_reduce_scatter( + e_hidden_states, dim=0) + final_hidden_states = final_hidden_states[:num_tokens] + dispose_tensor(e_hidden_states) + else: + final_hidden_states = e_hidden_states + else: + final_hidden_states = e_hidden_states + + if tp_size > 1 and not self.all_reduce_merge and fused_moe_state in [ + FusedMoEState.AllGather, FusedMoEState.AllGatherEP, + FusedMoEState.NaiveMulticast + ]: + final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states) + + if shared_experts: + return final_hidden_states, shared_hidden_states + else: + return final_hidden_states + + # ----------------------------------------- TBO-related -------------------------------------------- + + def _forward_ms_fused_moe_comp( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + is_prefill: bool, + real_top_k, + enable_force_load_balance: bool = False, + ): + hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states, + router_logits=router_logits, + top_k=real_top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, + is_prefill=is_prefill, + enable_force_load_balance=enable_force_load_balance, + ) + + return hidden_states diff --git a/vllm_ascend/torchair/ops/torchair_rotary_embedding.py b/vllm_ascend/torchair/ops/torchair_rotary_embedding.py new file mode 100644 index 0000000..5793288 --- /dev/null +++ b/vllm_ascend/torchair/ops/torchair_rotary_embedding.py @@ -0,0 +1,372 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +import math +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +import torch_npu +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.utils import enable_custom_op, is_310p + + +def custom_rotary_embedding_enabled(query, neox_style, head_size): + return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op( + ) + + +def rope_forward_oot( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + is_neox_style_override: Optional[bool] = None, + is_qwen_torchair: Optional[bool] = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + if get_ascend_config( + ).torchair_graph_config.enabled and not is_qwen_torchair: + return self.forward_native( + positions, + query, + key, + offsets, + ) + + query_shape, key_shape = query.shape, key.shape + if self.cos_sin_cache.device != query.device: + self.cos_sin_cache = self.cos_sin_cache.to(query.device) + if self.cos_sin_cache.dtype != query.dtype: + self.cos_sin_cache = self.cos_sin_cache.to(query.dtype) + neox_style = self.is_neox_style + if is_neox_style_override is not None: + neox_style = is_neox_style_override + # adopt custom kernel path for rotary_embedding + if 
custom_rotary_embedding_enabled(query, neox_style, + self.head_size) and not is_310p(): + query, key = torch.ops._C.rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + neox_style, + ) + return query.view(query_shape), key.view(key_shape) + if offsets is not None: + raise NotImplementedError( + "Batched rotary embedding is currently not supported on NPU.") + else: + # TODO: Remove the contiguous in the future. + query = query.contiguous().view(query.shape[0], -1) + key = key.contiguous().view(key.shape[0], -1) + torch_npu._npu_rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + neox_style, + ) + return query.view(query_shape), key.view(key_shape) + + +def native_rope_deepseek_forward(self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + max_seq_len: Optional[int] = None): + if max_seq_len is not None and max_seq_len > self.max_seq_len: + _set_cos_sin_cache(self, max_seq_len, query.device, query.dtype) + if len(key.shape) == 2: + key = key[:, None, :] + # Note: we implement the non neox_style method with shuffle the last dim and neox style + # calculation method which is also more compute friendly to the ascend machine + # https://huggingface.co/deepseek-ai/DeepSeek-V3-0324/blob/main/modeling_deepseek.py + neox_style = True + if self.is_neox_style is False: + b, h_q, d = query.shape + query = query.view(b, h_q, d // 2, 2).transpose(3, + 2).reshape(b, h_q, d) + b, h_k, d = key.shape + key = key.view(b, h_k, d // 2, 2).transpose(3, 2).reshape(b, h_k, d) + q_pe, k_pe = rope_forward_oot(self, positions, query, key, offsets, + neox_style) + return q_pe, k_pe + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +# Inverse dim formula to find dim based on number of rotations +def 
yarn_find_correction_dim(num_rotations, + dim, + base=10000, + max_position_embeddings=2048): + # Note: use torch instead of math to solve MTP compilation error. + return (dim * torch.log( + torch.tensor(max_position_embeddings) / + (num_rotations * 2 * torch.pi))) / (2 * torch.log(torch.tensor(base))) + + +def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + +# Find dim range bounds based on rotations +def yarn_find_correction_range(low_rot, + high_rot, + dim, + base=10000, + max_position_embeddings=2048): + # Note: use torch instead of math to solve MTP compilation error. + low = torch.floor( + yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = torch.ceil( + yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) + # Note: use torch instead of max/min to solve MTP compilation error. + return torch.clamp(low, min=0), torch.clamp(high, max=dim - 1) + + +def yarn_linear_ramp_mask(min_value, max_value, dim): + # Note: The if conditional branch is not used here + # to solve MTP compilation error. + max_value += (min_value == max_value).float() * 0.001 + linear_func = (torch.arange(dim, dtype=torch.float32) - + min_value) / (max_value - min_value) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. 
For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids] + sin = sin[position_ids] + cos = cos[:, None, None, :] + sin = sin[:, None, None, :] + + if len(q.shape) == 3: + q = q[:, :, None, :] + if len(k.shape) == 2: + k = k[:, None, None, :] + elif len(k.shape) == 3: + k = k[:, :, None, :] + + b, h_q, s, d = q.shape + q = q.view(b, h_q, s, d // 2, 2).transpose(4, 3).reshape(b, h_q, s, d) + + b, h_k, s, d = k.shape + k = k.view(b, h_k, s, d // 2, 2).transpose(4, 3).reshape(b, h_k, s, d) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + + q_embed = q_embed.view(b, h_q, d) + k_embed = k_embed.view(b, h_k, d) + + return q_embed, k_embed + + +def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + dim = self.rotary_dim + + freq_extra = 1.0 / (self.base**( + torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)) + freq_inter = 1.0 / (self.scaling_factor * self.base**( + torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)) + + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + dim, + self.base, + 
self.max_position_embeddings, + ) + inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to( + device=device, dtype=torch.float32) + inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange(seq_len * self.scaling_factor, + device=device, + dtype=torch.float32) + + freqs = torch.outer(t, inv_freq) + cos_cached = torch.cat([freqs, freqs], dim=-1).cos() * self.mscale + sin_cached = torch.cat([freqs, freqs], dim=-1).sin() * self.mscale + cos_cached = cos_cached.to(dtype) + sin_cached = sin_cached.to(dtype) + cache = torch.cat([freqs.cos() * self.mscale, + freqs.sin() * self.mscale], + dim=-1).to(dtype) + self.register_buffer("cos_sin_cache", cache, persistent=False) + self.register_buffer("cos_cached", cos_cached, persistent=False) + self.register_buffer("sin_cached", sin_cached, persistent=False) + + +def __set_cos_sin_cache(self, seq_len, device, dtype): + inv_freq = 1.0 / (self.base**(torch.arange( + 0, self.rotary_dim, 2, device=device, dtype=torch.float32) * + (1 / self.rotary_dim))) + self.register_buffer("inv_freq", inv_freq) + + t = torch.arange(self.max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.float32) + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos", emb.cos().to(dtype=dtype), persistent=False) + self.register_buffer("sin", emb.sin().to(dtype=dtype), persistent=False) + self.embed = F.embedding + + +_original_re_init = RotaryEmbedding.__init__ + + +def qwen_rope_init_func( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: float, + is_neox_style: bool, + dtype: torch.dtype, +) -> None: + _original_re_init(self, head_size, rotary_dim, max_position_embeddings, + base, is_neox_style, dtype) + if get_ascend_config().torchair_graph_config.enabled: + __set_cos_sin_cache(self, + seq_len=max_position_embeddings, + 
device="npu", + dtype=dtype) + + +def rope_forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + is_neox_style_override: Optional[bool] = None, + max_seq_len: Optional[int] = None, + is_prefill: Optional[bool] = True, + is_qwen_torchair: Optional[bool] = False, +): + if get_ascend_config().torchair_graph_config.enabled \ + and is_qwen_torchair and not is_prefill: + if max_seq_len is not None and torch.gt(max_seq_len, + self.max_position_embeddings): + __set_cos_sin_cache(self, + seq_len=max_seq_len, + device=query.device, + dtype=torch.float32) + + # bsnd/bnsd + if positions is not None: + cos = self.embed(positions, self.cos) + sin = self.embed(positions, self.sin) + self.cos_embed = cos + self.sin_embed = sin + else: + cos = self.cos_embed + sin = self.sin_embed + + query = query.view(*query.shape[:-1], -1, self.head_size).contiguous() + key = key.view(*key.shape[:-1], -1, self.head_size).contiguous() + + cos = cos.unsqueeze(-2).unsqueeze(-2) + sin = sin.unsqueeze(-2).unsqueeze(-2) + + query = query.unsqueeze(1) + key = key.unsqueeze(1) + + q_embed, k_embed = torch_npu.npu_apply_rotary_pos_emb( + query, key, cos, sin) + return q_embed.flatten(-2), k_embed.flatten(-2) + else: + return rope_forward_oot(self, positions, query, key, offsets, + is_neox_style_override, + is_qwen_torchair) # type: ignore + + +def deepseek_rope_init_func( + self, + head_size: int, + rotary_dim: int, + max_position_embeddings: int, + base: int, + is_neox_style: bool, + scaling_factor: float, + dtype: torch.dtype, + *, + extrapolation_factor: float = 1, + attn_factor: float = 1, + beta_fast: int = 32, + beta_slow: int = 1, + mscale: float = 1, + mscale_all_dim: float = 0, +) -> None: + self.scaling_factor = scaling_factor + self.extrapolation_factor = extrapolation_factor + self.attn_factor = attn_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + # Get n-d magnitude scaling corrected for 
interpolation. + self.mscale = float( + yarn_get_mscale(self.scaling_factor, float(mscale)) / + yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * + attn_factor) + super(DeepseekScalingRotaryEmbedding, + self).__init__(head_size, rotary_dim, max_position_embeddings, base, + is_neox_style, dtype) + self.max_seq_len = max_position_embeddings + _set_cos_sin_cache(self, + max_position_embeddings, + dtype=dtype, + device="npu") diff --git a/vllm_ascend/torchair/quantization/__init__.py b/vllm_ascend/torchair/quantization/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/torchair/quantization/torchair_quantizer.py b/vllm_ascend/torchair/quantization/torchair_quantizer.py new file mode 100644 index 0000000..1d1d584 --- /dev/null +++ b/vllm_ascend/torchair/quantization/torchair_quantizer.py @@ -0,0 +1,29 @@ +from vllm_ascend.quantization.quantizer import VLLMAscendQuantizer +from vllm_ascend.torchair.quantization.torchair_w4a8_dynamic import ( + TorchairAscendW4A8DynamicFusedMoEMethod, + TorchairAscendW4A8DynamicLinearMethod) +from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import ( + TorchairAscendW8A8DynamicFusedMoEMethod, + TorchairAscendW8A8DynamicLinearMethod) + + +class TorchairW8A8DYNAMICQuantizer(VLLMAscendQuantizer): + + @staticmethod + def build_linear_method(): + return TorchairAscendW8A8DynamicLinearMethod() + + @staticmethod + def build_moe_method(): + return TorchairAscendW8A8DynamicFusedMoEMethod() + + +class TorchairW4A8DYNAMICQuantizer(VLLMAscendQuantizer): + + @staticmethod + def build_linear_method(): + return TorchairAscendW4A8DynamicLinearMethod() + + @staticmethod + def build_moe_method(): + return TorchairAscendW4A8DynamicFusedMoEMethod() diff --git a/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py new file mode 100644 index 0000000..f38e2d8 --- /dev/null +++ b/vllm_ascend/torchair/quantization/torchair_w4a8_dynamic.py @@ 
-0,0 +1,439 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Callable, Dict, Optional + +import numpy as np +import torch +import torch_npu +from vllm.config import get_current_vllm_config +from vllm.distributed import get_ep_group +from vllm.forward_context import get_forward_context + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.torchair.ops.torchair_fused_moe import torchair_select_experts +from vllm_ascend.torchair.quantization.torchair_w8a8_dynamic import ( + torchair_fused_experts_with_all2all, torchair_fused_experts_with_mc2) +from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor + + +class TorchairAscendW4A8DynamicLinearMethod: + """Linear method for Ascend W4A8_DYNAMIC + """ + + def __init__(self): + self.transpose_weight = True + try: + self.group_size = get_current_vllm_config( + ).quant_config.quant_description.get("group_size", 256) + except AttributeError: + self.group_size = 256 + + @staticmethod + def get_weight(input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = { + "weight": torch.empty(output_size, input_size, dtype=torch.int8) + } + return params_dict + + @staticmethod + 
def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + @staticmethod + def get_perchannel_param(output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = {} + params_dict["weight_scale"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_offset"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_scale_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + params_dict["weight_offset_second"] = torch.empty(output_size, + input_size // + self.group_size, + dtype=params_dtype) + return params_dict + + @staticmethod + def process_scale_second(weight: torch.Tensor, scale: torch.Tensor, + per_group_scale: torch.Tensor): + k, n = weight.shape + group_num, n = per_group_scale.shape + weight_high = weight.to(torch.float32).reshape( + group_num, -1, n) * per_group_scale.reshape(group_num, 1, n) + weight_high = weight_high.reshape(k, n) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(dim=0) + antiquant_scale = (scale * per_group_scale).reshape(group_num, n) + return antiquant_scale.npu(), bias + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = None, + ) -> torch.Tensor: + return torch_npu.npu_weight_quant_batchmatmul( + x, + layer.weight, + antiquant_scale=layer.weight_scale_second.to(x.dtype), + antiquant_group_size=self.group_size, + ) + + def process_weights_after_loading(self, layer: torch.nn.Module): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + layer.weight_scale.data = layer.weight_scale.data.flatten().to( + torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + layer.weight_scale_second.data, scale_bias = self.process_scale_second( 
+ layer.weight.data, + layer.weight_scale.data, + layer.weight_scale_second.data.transpose(0, 1).contiguous(), + ) + param = torch.nn.Parameter(scale_bias, requires_grad=False) + layer.register_parameter("weight_scale_bias", param) + layer.weight.data = torch_npu.npu_convert_weight_to_int4pack( + layer.weight.data.to(torch.int32)) + + +class TorchairAscendW4A8DynamicFusedMoEMethod: + """FusedMoe method for Ascend W4A8_DYNAMIC. + """ + + def __init__(self): + self.transpose_weight = True + + self.ep_group = get_ep_group() + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + vllm_config = get_current_vllm_config() + self.group_size = vllm_config.quant_config.quant_description.get( + "group_size", 256) + quant_version = vllm_config.quant_config.quant_description.get( + "version", "0") + # NOTE: new quantize weights: 2 int4 pack into int8 + self.new_quant_version = quant_version == "1.0.0" + self.tp_size = 1 if vllm_config.parallel_config.enable_expert_parallel else self.ep_group.world_size + if self.new_quant_version and self.tp_size > 16: + raise ValueError( + "The current weight does not support moe part tp>16.") + + try: + device_group = get_mc2_group().device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name( + local_rank) + except AttributeError: + self.moe_all_to_all_group_name = "" + + def get_weight(self, num_experts: int, + intermediate_size_per_partition: int, hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + if self.new_quant_version: + w13_output_size = intermediate_size_per_partition + w2_output_size = hidden_sizes // 2 + else: + w13_output_size = 2 * intermediate_size_per_partition + w2_output_size = hidden_sizes + + param_dict["w13_weight"] = 
torch.empty(num_experts, + w13_output_size, + hidden_sizes, + dtype=torch.int8) + param_dict["w2_weight"] = torch.empty(num_experts, + w2_output_size, + intermediate_size_per_partition, + dtype=torch.int8) + return param_dict + + def get_dynamic_quant_param(self, num_experts: int, + intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + param_dict["w13_weight_offset"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + + param_dict["w13_weight_scale_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // self.group_size, + dtype=params_dtype) + + param_dict["w13_weight_offset_second"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_sizes // self.group_size, + dtype=params_dtype) + + param_dict["w2_weight_scale"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_offset"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + param_dict["w2_weight_scale_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // self.group_size, + dtype=params_dtype) + param_dict["w2_weight_offset_second"] = torch.empty( + num_experts, + hidden_sizes, + intermediate_size_per_partition // self.group_size, + dtype=params_dtype) + + if self.new_quant_version: + param_dict["w13_scale_bias"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=torch.float32) + param_dict["w2_scale_bias"] = torch.empty(num_experts, + hidden_sizes, + 16 // self.tp_size, + dtype=torch.float32) + + return param_dict + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = 
False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = True, + enable_force_load_balance: bool = True, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + assert router_logits.shape[ + 1] == global_num_experts, "Number of global experts mismatch" + + if global_num_experts == 256: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=top_k, # topk currently is 8 + bias=e_score_correction_bias, + k_group=topk_group, # fix: 4 + group_count=num_expert_group, # fix 8 + group_select_mode= + 1, # 0: the maximum in the group; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + # out_flag=False, # todo new api; should the third output be output + # y2_flag=False, # old api; should the third output be output + routed_scaling_factor=1, + eps=float(1e-20)) + else: + topk_weights, topk_ids = torchair_select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + fused_moe_state = get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(quantized_x_for_share, router_logits) + 
share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. + if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) + + topk_weights = topk_weights.to(x.dtype) + if fused_moe_state == FusedMoEState.MC2: + return torchair_fused_experts_with_mc2( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_scale=layer.w13_weight_scale_second, + w2_scale=layer.w2_weight_scale_second, + w1_scale_bias=layer.w13_scale_bias, + w2_scale_bias=layer.w2_scale_bias, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + moe_all_to_all_group_name=self.moe_all_to_all_group_name, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + shared_experts=shared_experts, + is_torchair=self.torchair_graph_enabled, + quantized_x_for_share=shared_gate_up, + dynamic_scale_for_share=shared_dequant_scale, + mc2_mask=kwargs.get("mc2_mask", None)) + else: + # The current implementation of deepseek moe splits hidden_states + # according to tp_size before they are feed into layers module. + # Therefore, all2all is needed no matter how dp/tp is set so as to + # dispatch/combine tokens. 
+ return torchair_fused_experts_with_all2all( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_scale=layer.w13_weight_scale_second, + w2_scale=layer.w2_weight_scale_second, + w1_scale_bias=layer.w13_scale_bias, + w2_scale_bias=layer.w2_scale_bias, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + ep_group=self.ep_group, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + ) + + def process_scale(self, weight: torch.Tensor, scale, per_group_scale): + group_num, k, n = weight.shape + # the weight of the new version is reduced by half by pack n, so it needs to be restored + if self.new_quant_version: + n = n * 2 + per_group_scale = per_group_scale.reshape(group_num, -1, n) + group_num, quantgroup_num, n = per_group_scale.shape + bias = None + if not self.new_quant_version: + weight_high = weight.to(torch.float32).reshape([group_num, quantgroup_num, -1, n]) * \ + per_group_scale.reshape([group_num, quantgroup_num, 1, n]) + weight_high = weight_high.reshape([group_num, k, n]) + bias = 8 * (weight_high.to(torch.float32) * scale).sum(axis=1) + scale_fp32 = (scale * per_group_scale).to(torch.float16).to( + torch.float32) + scale_fp32_np = scale_fp32.cpu().numpy() + scale_fp32_np.dtype = np.uint32 + sscale_uint64 = np.zeros((group_num, quantgroup_num, n * 2), + dtype=np.uint32) + + sscale_uint64[..., ::2] = scale_fp32_np + + sscale_uint64_buffer = np.frombuffer(sscale_uint64.tobytes(), + dtype=np.int64).copy() + sscale_uint64_tensor = torch.from_numpy(sscale_uint64_buffer).reshape( + group_num, quantgroup_num, n) + sscale_uint64_tensor = sscale_uint64_tensor.npu() + return sscale_uint64_tensor, bias + + def update_bias(self, layer, w13_bias, w2_bias): + if self.new_quant_version: + layer.w13_scale_bias.data = layer.w13_scale_bias.data.transpose( + 1, 2).contiguous().sum(axis=1) + layer.w2_scale_bias.data = layer.w2_scale_bias.data.transpose( + 1, 2).contiguous().sum(axis=1) + else: + 
w13_scale_bias = torch.nn.Parameter(w13_bias, requires_grad=False) + layer.register_parameter("w13_scale_bias", w13_scale_bias) + w2_scale_bias = torch.nn.Parameter(w2_bias, requires_grad=False) + layer.register_parameter("w2_scale_bias", w2_scale_bias) + + def pack_to_int32(self, weight: torch.Tensor): + if self.new_quant_version: + group_num, k, n = weight.shape + assert n % 4 == 0, "the last dim of weight needs to be divided by 4" + packed_n = n // 4 + # pack 4 int8(int4*2) to int32, because in pytorch, we need to use int32 to represent int4 + packed_weight = torch.from_numpy( + np.frombuffer(weight.cpu().numpy().tobytes(), dtype=np.int32)) + return packed_weight.reshape(group_num, k, packed_n).npu() + else: + return torch_npu.npu_quantize(weight.to(torch.float32), + torch.tensor([1.]).npu(), None, + torch.quint4x2, -1, False) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.w13_weight.data = layer.w13_weight.data.transpose( + 1, 2).contiguous() + layer.w2_weight.data = layer.w2_weight.data.transpose( + 1, 2).contiguous() + layer.w13_weight_scale.data = layer.w13_weight_scale.data.transpose( + 1, 2).contiguous() + layer.w2_weight_scale.data = layer.w2_weight_scale.data.transpose( + 1, 2).contiguous() + layer.w13_weight_scale_second.data = layer.w13_weight_scale_second.data.transpose( + 1, 2).contiguous() + layer.w2_weight_scale_second.data = layer.w2_weight_scale_second.data.transpose( + 1, 2).contiguous() + + layer.w13_weight_scale_second.data, w13_bias = self.process_scale( + layer.w13_weight, layer.w13_weight_scale.data, + layer.w13_weight_scale_second.data) + layer.w2_weight_scale_second.data, w2_bias = self.process_scale( + layer.w2_weight, layer.w2_weight_scale.data, + layer.w2_weight_scale_second.data) + + self.update_bias(layer, w13_bias, w2_bias) + + layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data) + layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data) diff --git 
a/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py new file mode 100644 index 0000000..5c3fa95 --- /dev/null +++ b/vllm_ascend/torchair/quantization/torchair_w8a8_dynamic.py @@ -0,0 +1,1035 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import torch +import torch.distributed as dist +import torch_npu +from vllm.distributed import GroupCoordinator, get_ep_group +from vllm.forward_context import get_forward_context + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import FusedMoEState +from vllm_ascend.distributed.parallel_state import get_mc2_group +from vllm_ascend.torchair.ops.torchair_fused_moe import torchair_select_experts +from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendSocVersion, + dispose_tensor, get_ascend_soc_version) + + +def torchair_apply_mlp_decode(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + group_list: torch.Tensor, + dynamic_scale: torch.Tensor = None, + group_list_type: int = 1) -> torch.Tensor: + """ + apply MLP: gate_up_proj -> 
swiglu -> down_proj + Args: + hidden_states_wrapper: wrapper of input hidden states with shape (num_tokens, hidden_size). + w1: expert weights1 with shape + (num_experts, hidden_size, intermediate_size * 2) + w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2) + w2: expert weights2 with shape + (num_experts, intermediate_size, hidden_size) + w2_scale: weights2 scale with shape (num_experts, hidden_size) + group_list: number of tokens for each expert, follow cumsum mode, and + with shape (num_experts). + transpose_weight: + w1: (num_experts, intermediate_size * 2, hidden_size) -> + (num_experts, hidden_size, intermediate_size * 2) + w2: (num_experts, hidden_size, intermediate_size) -> + (num_experts, intermediate_size, hidden_size) + Returns: + hidden_states: output hidden states after MLP. + """ + + if dynamic_scale is None: + unquantized_hidden_states = hidden_states + hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant( + hidden_states) + # Dispose the original unquantized hidden states + # to save npu memory because they're no longer used. 
+ dispose_tensor(unquantized_hidden_states) + else: + pertoken_scale = dynamic_scale + + # gmm1: gate_up_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=3, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=torch.int32)[0] + + # act_fn: swiglu + hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant( + x=hidden_states, + weight_scale=w1_scale, + activation_scale=pertoken_scale, + bias=None, + quant_scale=None, + quant_offset=None, + group_index=group_list, + activate_left=True, + quant_mode=1, + ) + + # gmm2: down_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + scale=[w2_scale], + per_token_scale=[swiglu_out_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=w2_scale.dtype)[0] + return hidden_states + + +def torchair_apply_mlp(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + group_list: torch.Tensor, + dynamic_scale: torch.Tensor = None, + group_list_type: int = 1, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None) -> torch.Tensor: + """ + apply MLP: gate_up_proj -> swiglu -> down_proj + + Args: + hidden_states: input hidden states with shape (num_tokens, hidden_size). + w1: expert weights1 with shape + (num_experts, hidden_size, intermediate_size * 2) + w1_scale: weights1 scale with shape (num_experts, intermediate_size * 2) + w2: expert weights2 with shape + (num_experts, intermediate_size, hidden_size) + w2_scale: weights2 scale with shape (num_experts, hidden_size) + group_list: number of tokens for each expert, follow cumsum mode, and + with shape (num_experts). 
+ transpose_weight: + w1: (num_experts, intermediate_size * 2, hidden_size) -> + (num_experts, hidden_size, intermediate_size * 2) + w2: (num_experts, hidden_size, intermediate_size) -> + (num_experts, intermediate_size, hidden_size) + + Returns: + hidden_states: output hidden states after MLP. + """ + + if dynamic_scale is None: + unquantized_hidden_states = hidden_states + hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant( + hidden_states) + # Dispose the original unquantized hidden states + # to save npu memory because they're no longer used. + dispose_tensor(unquantized_hidden_states) + else: + pertoken_scale = dynamic_scale + + bias1, bias2 = None, None + _output_dtype = w2_scale.dtype + + if w1_scale_bias is not None: + if group_list_type == 0: + group_list = torch.cat( + [group_list[:1], torch.diff(group_list, dim=0)]) + group_list_type = 1 + bias1 = [w1_scale_bias] + bias2 = [w2_scale_bias] + # TODO w4a8 scene: dynamic acquisition of dtype in the future + _output_dtype = torch.bfloat16 + + # gmm1: gate_up_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + scale=[w1_scale], + bias=bias1, + per_token_scale=[pertoken_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=_output_dtype)[0] + + # act_fn: swiglu + hidden_states = torch_npu.npu_swiglu(hidden_states) + hidden_states, swiglu_out_scale = torch_npu.npu_dynamic_quant( + hidden_states) + + # gmm2: down_proj + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w2], + scale=[w2_scale], + bias=bias2, + per_token_scale=[swiglu_out_scale], + split_item=2, + group_list_type=group_list_type, + group_type=0, + group_list=group_list, + output_dtype=_output_dtype)[0] + + return hidden_states + + +def torchair_fused_experts_with_mc2( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: 
torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + moe_all_to_all_group_name: str = "", + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + is_torchair: bool = False, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, + mc2_mask: Optional[torch.Tensor] = None, + shared_gate_up: Optional[Any] = None, + shared_dequant_scale: Optional[Any] = None, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + assert mc2_mask is not None + if log2phy is not None: + topk_ids = log2phy[topk_ids] + + quant_mode = 2 + ep_group = get_mc2_group() + ep_rank_id = ep_group.rank_in_group + ep_world_size = ep_group.world_size + + # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine + need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3 + or is_torchair) + + # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine + a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3 + + enable_dispatch_v2 = hasattr(torch_npu, "npu_moe_distribute_dispatch_v2") + + if (expert_map is not None): + moe_expert_num = len(expert_map) + global_redundant_expert_num + else: + moe_expert_num = global_redundant_expert_num + # hidden_states = hidden_states.bfloat16() + kwargs_mc2 = { + "x": hidden_states, + "expert_ids": topk_ids, + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + + stage1_kwargs = { + "scales": None, + "quant_mode": quant_mode, + "group_ep": moe_all_to_all_group_name, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, + } + if need_extra_args: + stage1_kwargs.update({ + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if 
a3_need_extra_args and enable_dispatch_v2: + stage1_kwargs.update({ + "x_active_mask": mc2_mask, + }) + kwargs_mc2.update(stage1_kwargs) + + output = torch_npu.npu_moe_distribute_dispatch_v2( + **kwargs_mc2 + ) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_dispatch( + **kwargs_mc2) + # comm_stream.wait_stream(torch.npu.current_stream()) + expand_x, dynamic_scale, assist_info_for_combine, expert_token_nums, ep_recv_counts = output[ + 0:5] + + if shared_experts is not None: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_gate_up, expand_x) + shared_act_out = shared_experts.act_fn( + (shared_gate_up, shared_dequant_scale)) + shared_act, swiglu_out_scale = shared_act_out[0], shared_act_out[1] + + # `expand_x` will be disposed in the `apply_mlp` function + if w1_scale_bias is None: + down_out_list = torchair_apply_mlp_decode(expand_x, + w1, + w1_scale, + w2, + w2_scale, + expert_token_nums, + dynamic_scale=dynamic_scale) + else: + # w4a8 scene, cannot use apply_mlp_decode because the operator is not supported + down_out_list = torchair_apply_mlp(expand_x, + w1, + w1_scale, + w2, + w2_scale, + expert_token_nums, + dynamic_scale=dynamic_scale, + w1_scale_bias=w1_scale_bias, + w2_scale_bias=w2_scale_bias) + + # moeCombine + kwargs_mc2 = { + "expand_x": down_out_list, + "expert_ids": topk_ids, + "expert_scales": topk_weights.to(torch.float32), + "expert_shard_type": 0, + "shared_expert_rank_num": 0, + "moe_expert_num": moe_expert_num, + "global_bs": 0, + } + tp_recv_counts = torch.empty(1, + dtype=torch.int32, + device=hidden_states.device) + stage3_kwargs = { + "ep_send_counts": ep_recv_counts, + "group_ep": moe_all_to_all_group_name, + "ep_world_size": ep_world_size, + "ep_rank_id": ep_rank_id, + } + if enable_dispatch_v2: + stage3_kwargs.update({ + "assist_info_for_combine": + assist_info_for_combine, + }) + else: + stage3_kwargs.update({ + "expand_idx": assist_info_for_combine, + }) + if need_extra_args: + stage3_kwargs.update({ + 
"tp_send_counts": tp_recv_counts, + "group_tp": moe_all_to_all_group_name, + "tp_world_size": 1, + "tp_rank_id": 0, + }) + if a3_need_extra_args and enable_dispatch_v2: + stage3_kwargs.update({ + "x_active_mask": mc2_mask, + }) + kwargs_mc2.update(stage3_kwargs) + + hidden_states = torch_npu.npu_moe_distribute_combine_v2( + **kwargs_mc2 + ) if enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine( + **kwargs_mc2) + + if shared_experts is None: + return hidden_states + else: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(shared_act, down_out_list) + shared_output, _ = shared_experts.down_proj( + (shared_act, swiglu_out_scale)) + return hidden_states, shared_output + + +def torchair_init_routing_quant(hidden_states, top_k, topk_ids, + global_num_experts): + num_tokens, _ = hidden_states.shape + row_idx_len = num_tokens * top_k + row_idx = (torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=hidden_states.device).view( + top_k, -1).permute(1, 0).contiguous()) + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + expanded_row_idx = (expanded_row_idx.view(top_k, -1).permute( + 1, 0).contiguous().view(-1)) + global_expert_tokens = torch.bincount(expanded_expert_idx, + minlength=global_num_experts) + global_expert_tokens = global_expert_tokens.to(torch.int32) + quantized_tokens, token_scales = torch_npu.npu_dynamic_quant(hidden_states) + return quantized_tokens, expanded_row_idx, global_expert_tokens, token_scales + + +# currently expert parallelism implemented with all2all +# is under-optimized. 
+def torchair_fused_experts_with_all2all( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None, + ep_group: GroupCoordinator = None, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + w1_scale_bias: torch.Tensor = None, + w2_scale_bias: torch.Tensor = None, +): + if log2phy is not None: + topk_ids = log2phy[topk_ids] + original_shape = hidden_states.shape + if len(original_shape) == 3: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + num_tokens, _ = hidden_states.shape + num_experts = w1.shape[0] + + if expert_map is not None: + global_num_experts = len(expert_map) + global_redundant_expert_num + if hasattr(torch_npu, "npu_moe_init_routing_quant"): + quantized_tokens, expanded_row_idx, global_expert_tokens, _, token_scales = torch_npu.npu_moe_init_routing_quant( + hidden_states, + expert_idx=topk_ids.to(torch.int32), + active_num=0, + expert_capacity=0, + expert_num=global_num_experts, + drop_pad_mode=0, + expert_tokens_num_mode=2, + expert_tokens_before_capacity_flag=False, + quant_mode=1, + ) + else: + quantized_tokens, expanded_row_idx, global_expert_tokens, token_scales = torchair_init_routing_quant( + hidden_states, top_k, topk_ids, global_num_experts) + + gather_sizes = global_expert_tokens.new_empty( + global_expert_tokens.shape[0]) + dist.all_to_all_single(gather_sizes, global_expert_tokens) + + token_counts_combined = torch.stack( + [gather_sizes, global_expert_tokens], dim=0) + token_counts_combined = token_counts_combined.view( + 2, ep_group.world_size, -1).sum(dim=2) + token_counts_combined_cpu = token_counts_combined.to( + torch.device("cpu"), non_blocking=True).numpy() + all_tokens = gather_sizes.sum() + + gathered_tokens = quantized_tokens.new_empty(all_tokens.item(), + quantized_tokens.shape[1]) + dynamic_scale = 
token_scales.new_empty(gathered_tokens.shape[0]) + gather_size_list = token_counts_combined_cpu[1] + scatter_size_list = token_counts_combined_cpu[0] + + dist.all_to_all_single(gathered_tokens, quantized_tokens, + scatter_size_list, gather_size_list) + dist.all_to_all_single(dynamic_scale, token_scales, scatter_size_list, + gather_size_list) + + hidden_states, dynamic_scale, inverse_indices, expert_tokens = torch_npu.npu_moe_re_routing( + gathered_tokens, + gather_sizes.view(ep_group.world_size, -1), + per_token_scales=dynamic_scale) + expert_tokens = expert_tokens.to(torch.int64) + group_list_type = 1 + else: + row_idx_len = num_tokens * top_k + row_idx = torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=topk_weights.device).view( + top_k, -1).permute(1, 0).contiguous() + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, num_experts) + expert_tokens = expert_tokens.to(torch.int64) + group_list_type = 0 + dynamic_scale = None + + # `hidden_states` will be disposed in the `apply_mlp` function + hidden_states = torchair_apply_mlp( + hidden_states, + w1, + w1_scale, #17 + w2, + w2_scale, + expert_tokens, #16 + dynamic_scale=dynamic_scale, + group_list_type=group_list_type, + w1_scale_bias=w1_scale_bias, + w2_scale_bias=w2_scale_bias) + + if expert_map is not None: + reordered_outputs = torch.index_select( + hidden_states, + dim=0, + # Workaround: Convert to float so that argsort runs on AI Core instead of slower AICPU + index=inverse_indices.to(torch.float32).argsort().to(torch.int32)) + + hidden_states = reordered_outputs.new_empty(*quantized_tokens.shape) + dist.all_to_all_single(hidden_states, reordered_outputs, + gather_size_list, scatter_size_list) + + final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + 
skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=None, + drop_pad_mode=2) + else: + # TODO: Reorder device memory 2 times here, replace the current + # implementation here when suitable operators become available. + final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) + return final_hidden_states + + +def torchair_fused_experts_with_allgather(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None): + original_shape = hidden_states.shape + if len(original_shape) == 3: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + num_tokens = hidden_states.shape[0] + batch_size, hidden_size = hidden_states.shape + topk_weights = topk_weights.to(hidden_states.dtype) + + ep_group = get_ep_group().device_group + ep_rank = torch.distributed.get_rank(group=ep_group) + ep_size = torch.distributed.get_world_size(ep_group) + + global_num_experts = len(expert_map) + local_num_experts = global_num_experts // ep_size + + hidden_states, pertoken_scale = torch_npu.npu_dynamic_quant(hidden_states) + + hidden_states, expanded_x_idx, expert_tokens, pertoken_scale = torch_npu.npu_moe_init_routing_v2( + hidden_states, + topk_ids, + scale=pertoken_scale, + offset=None, + active_num=num_tokens * top_k, + expert_num=global_num_experts, + expert_tokens_num_type=1, + expert_tokens_num_flag=True, + active_expert_range=[ + ep_rank * local_num_experts, (ep_rank + 1) * local_num_experts + ], + quant_mode=-1, + row_idx_type=1) + group_list_type = 1 + + sorted_topk_weight = 
torch.index_select(topk_weights.view(-1), 0, + expanded_x_idx) + row_index = expanded_x_idx // topk_ids.shape[-1] + row_index = row_index.to(torch.int64) + share_input = torch.zeros((batch_size, hidden_size), + dtype=torch.bfloat16, + device="npu") + + hidden_states = torch_npu.npu_grouped_matmul( + x=[hidden_states], + weight=[w1], + split_item=3, + group_list_type=group_list_type, + group_type=0, + group_list=expert_tokens, + output_dtype=torch.int32)[0] + + # act_fn: swiglu + hidden_states, pertoken_scale = torch_npu.npu_dequant_swiglu_quant( + x=hidden_states, + weight_scale=w1_scale.to(torch.float32), + activation_scale=pertoken_scale, + bias=None, + quant_scale=None, + quant_offset=None, + group_index=expert_tokens, + activate_left=True, + quant_mode=1, + ) + + final_hidden_states = torch_npu.npu_grouped_matmul_finalize_routing( + hidden_states, + w2, + scale=w2_scale.to(torch.float32), + bias=None, + pertoken_scale=pertoken_scale.view(-1), + group_list=expert_tokens, + shared_input=share_input, + logit=sorted_topk_weight.to(torch.float32), + row_index=row_index, + output_bs=batch_size).to(torch.bfloat16) + + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) + + return final_hidden_states + + +def torchair_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w1_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + top_k: int, + expert_map: torch.Tensor = None): + original_shape = hidden_states.shape + if len(original_shape) == 3: + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + num_tokens, _ = hidden_states.shape + num_experts = w1.shape[0] + dtype = hidden_states.dtype + device = hidden_states.device + + if expert_map is not None: + # Generate token indices and flatten + token_indices = (torch.arange(num_tokens, + device=device, + dtype=torch.int64).unsqueeze(1).expand( + -1, top_k).reshape(-1)) + + # Flatten 
token-to-expert mappings and map to local experts + weights_flat = topk_weights.view(-1) + experts_flat = topk_ids.view(-1) + local_experts_flat = expert_map[experts_flat] + + # Filter valid token-expert pairs + mask = local_experts_flat != -1 + filtered_weights = torch.where( + mask, weights_flat, torch.zeros_like(weights_flat)).to(dtype) + filtered_experts = torch.where( + mask, local_experts_flat, + torch.full_like(local_experts_flat, + num_experts)).to(topk_ids.dtype) + + # Sort by local expert IDs + sort_indices = torch.argsort(filtered_experts) + sorted_token_indices = token_indices[sort_indices] + sorted_weights = filtered_weights[sort_indices] + + # Compute token counts with minlength of num_experts + # This is equivalent to but faster than: + # >>> token_counts = torch.bincount(filtered_experts, minlength=num_experts)[:-1] + token_counts = torch.zeros(num_experts + 1, + device=device, + dtype=torch.int64) + ones = torch.ones_like(filtered_experts, dtype=torch.int64) + token_counts.scatter_add_(0, filtered_experts.to(torch.int64), ones) + expert_tokens = token_counts[:num_experts] + # Rearrange hidden_states + hidden_states = hidden_states[sorted_token_indices] + group_list_type = 1 + else: + row_idx_len = num_tokens * top_k + row_idx = torch.arange(0, + row_idx_len, + dtype=torch.int32, + device=topk_weights.device).view( + top_k, -1).permute(1, 0).contiguous() + hidden_states, expanded_row_idx, expanded_expert_idx = torch_npu.npu_moe_init_routing( + hidden_states, + row_idx=row_idx, + expert_idx=topk_ids, + active_num=num_tokens) + + expert_tokens = torch_npu.npu_moe_compute_expert_tokens( + expanded_expert_idx, num_experts) + expert_tokens = expert_tokens.to(torch.int64) + group_list_type = 0 + + # `hidden_states` will be disposed in the `apply_mlp` function + hidden_states = torchair_apply_mlp(hidden_states, + w1, + w1_scale, + w2, + w2_scale, + expert_tokens, + group_list_type=group_list_type) + + if expert_map is not None: + 
hidden_states.mul_(sorted_weights.unsqueeze(1)) + final_hidden_states = torch.zeros(*original_shape, + device=device, + dtype=dtype) + + num_valid_tokens = mask.sum() + valid_token_mask = torch.arange( + 0, sorted_token_indices.shape[0], + device=device).unsqueeze(1) < num_valid_tokens + hidden_states = hidden_states.masked_fill_(~valid_token_mask, + 0).to(dtype) + final_hidden_states.index_add_(0, sorted_token_indices, hidden_states) + else: + # TODO: Reorder device memory 2 times here, replace the current + # implementation here when suitable operators become available. + final_hidden_states = torch_npu.npu_moe_finalize_routing( + hidden_states, + skip1=None, + skip2=None, + bias=None, + scales=topk_weights, + expanded_src_to_dst_row=expanded_row_idx, + export_for_source_row=topk_ids, + ) + + if len(original_shape) == 3: + final_hidden_states = final_hidden_states.view(original_shape) + return final_hidden_states + + +class TorchairAscendW8A8DynamicLinearMethod: + """Linear method for Ascend W8A8_DYNAMIC. 
+ """ + + def __init__(self): + self.transpose_weight = True + + @staticmethod + def get_weight(input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + params_dict = { + "weight": torch.empty(output_size, input_size, dtype=torch.int8) + } + return params_dict + + @staticmethod + def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + @staticmethod + def get_perchannel_param( + output_size: int, + params_dtype: torch.dtype, + ) -> Dict[str, Any]: + params_dict = {} + params_dict["weight_scale"] = torch.empty(output_size, + 1, + dtype=params_dtype) + params_dict["weight_offset"] = torch.empty(output_size, + 1, + dtype=params_dtype) + return params_dict + + def get_pergroup_param(self, input_size: int, output_size: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + return {} + + @staticmethod + def apply( + layer: torch.nn.Module, + x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + bias: Optional[torch.Tensor] = None, + tp_rank: Optional[int] = 0, + ) -> torch.Tensor: + config = getattr(layer, "_ascend_quant_config", {}) + if not isinstance(x, tuple): + output_dtype = config.get("output_dtype", x.dtype) + quantized_x, dynamic_scale = torch_npu.npu_dynamic_quant(x) + else: + assert "output_dtype" in config.keys(), ( + f"DynamicLinearMethod needs explicitly specified `output_dtype`" + f"for pre-quantized input, got config [{config}]") + output_dtype = config["output_dtype"] + quantized_x, dynamic_scale = x + pertoken_scale = (dynamic_scale + if config.get("pertoken_scale", True) else None) + + output = torch_npu.npu_quant_matmul( + quantized_x, + layer.weight, + layer.weight_scale, + pertoken_scale=pertoken_scale, + bias=bias, + output_dtype=output_dtype, + ) + return ((output, dynamic_scale) + if config.get("return_scale", False) else output) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.weight.data = layer.weight.data.transpose(0, 1).contiguous() + # 
cast quantized weight tensors in NZ format (29) for higher inference speed + layer.weight.data = torch_npu.npu_format_cast(layer.weight.data, 29) + layer.weight_scale.data = layer.weight_scale.data.flatten() + layer.weight_scale_fp32 = layer.weight_scale.data.to(torch.float32) + layer.weight_offset.data = layer.weight_offset.data.flatten() + + +class TorchairAscendW8A8DynamicFusedMoEMethod: + """FusedMoe method for Ascend W8A8_DYNAMIC. + """ + + def __init__(self): + self.transpose_weight = True + + self.ep_group = get_ep_group() + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + + try: + device_group = get_mc2_group().device_group + # TODO: Try local_rank = ep_group.rank_in_group + local_rank = torch.distributed.get_rank(group=device_group) + backend = device_group._get_backend(torch.device("npu")) + self.moe_all_to_all_group_name = backend.get_hccl_comm_name( + local_rank) + except AttributeError: + self.moe_all_to_all_group_name = "" + + @staticmethod + def get_weight(num_experts: int, intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight"] = torch.empty(num_experts, + 2 * + intermediate_size_per_partition, + hidden_sizes, + dtype=torch.int8) + param_dict["w2_weight"] = torch.empty(num_experts, + hidden_sizes, + intermediate_size_per_partition, + dtype=torch.int8) + return param_dict + + @staticmethod + def get_dynamic_quant_param(num_experts: int, + intermediate_size_per_partition: int, + hidden_sizes: int, + params_dtype: torch.dtype) -> Dict[str, Any]: + param_dict = {} + param_dict["w13_weight_scale"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + param_dict["w13_weight_offset"] = torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + 1, + dtype=params_dtype) + param_dict["w2_weight_scale"] = torch.empty(num_experts, + hidden_sizes, + 
1, + dtype=params_dtype) + param_dict["w2_weight_offset"] = torch.empty(num_experts, + hidden_sizes, + 1, + dtype=params_dtype) + return param_dict + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None, + scoring_func: str = "softmax", + e_score_correction_bias: Optional[torch.Tensor] = None, + is_prefill: bool = True, + enable_force_load_balance: bool = True, + log2phy: torch.Tensor = None, + global_redundant_expert_num: int = 0, + shared_experts: Optional[Any] = None, + quantized_x_for_share: Optional[Any] = None, + dynamic_scale_for_share: Optional[Any] = None, + **kwargs, + ) -> torch.Tensor: + assert router_logits.shape[ + 1] == global_num_experts, "Number of global experts mismatch" + + is_deepseek_v3_r1 = global_num_experts == 256 + + # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern + if is_deepseek_v3_r1: + topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k( + router_logits, + k=top_k, # topk currently is 8 + bias=e_score_correction_bias, + k_group=topk_group, # fix: 4 + group_count=num_expert_group, # fix 8 + group_select_mode= + 1, # 0: the maximum in the group; 1: topk2.sum(fix) + renorm=0, # 0: softmax->topk(fix); 1: topk->softmax + norm_type=1, # 0: softmax; 1: sigmoid(fix) + # out_flag=False, # todo new api; should the third output be output + # y2_flag=False, # old api; should the third output be output + routed_scaling_factor=1, + eps=float(1e-20)) + else: + topk_weights, topk_ids = torchair_select_experts( + hidden_states=x, + router_logits=router_logits, + top_k=top_k, + use_grouped_topk=use_grouped_topk, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + 
custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + ) + + fused_moe_state = get_forward_context().fused_moe_state + shared_gate_up, shared_dequant_scale = None, None + if shared_experts is not None and fused_moe_state == FusedMoEState.MC2: + with npu_stream_switch("moe_secondary", 0): + npu_wait_tensor(quantized_x_for_share, router_logits) + share_up_out, _ = shared_experts.gate_up_proj( + (quantized_x_for_share, dynamic_scale_for_share)) + shared_gate_up, shared_dequant_scale = share_up_out[ + 0], share_up_out[1] + + # this is a naive implementation for experts load balance so as + # to avoid accumulating too much tokens on a single rank. + # currently it is only activated when doing profile runs. + if enable_force_load_balance: + topk_ids = torch.randint_like(topk_ids, 0, global_num_experts) + + topk_weights = topk_weights.to(x.dtype) + if fused_moe_state == FusedMoEState.AllGatherEP: + return torchair_fused_experts_with_allgather( + hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map) + elif fused_moe_state == FusedMoEState.MC2: + return torchair_fused_experts_with_mc2( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + w1_scale=layer.w13_weight_scale_fp32, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + moe_all_to_all_group_name=self.moe_all_to_all_group_name, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + shared_experts=shared_experts, + is_torchair=self.torchair_graph_enabled, + mc2_mask=kwargs.get("mc2_mask", None), + shared_gate_up=shared_gate_up, + shared_dequant_scale=shared_dequant_scale) + elif fused_moe_state in [ + FusedMoEState.AllGather, FusedMoEState.NaiveMulticast + ]: + return 
torchair_fused_experts(hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map) + else: + # The current implementation of deepseek moe splits hidden_states + # according to tp_size before they are feed into layers module. + # Therefore, all2all is needed no matter how dp/tp is set so as to + # dispatch/combine tokens. + return torchair_fused_experts_with_all2all( + hidden_states=x, + w1=layer.w13_weight, + w1_scale=layer.w13_weight_scale, + w2=layer.w2_weight, + w2_scale=layer.w2_weight_scale, + topk_weights=topk_weights, + topk_ids=topk_ids, + top_k=top_k, + expert_map=expert_map, + ep_group=self.ep_group, + log2phy=log2phy, + global_redundant_expert_num=global_redundant_expert_num, + ) + + def process_weights_after_loading(self, layer): + if self.transpose_weight: + layer.w13_weight.data = layer.w13_weight.data.transpose( + 1, 2).contiguous() + layer.w2_weight.data = layer.w2_weight.data.transpose( + 1, 2).contiguous() + if envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP: + torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ) + layer.w13_weight_scale.data = layer.w13_weight_scale.data.view( + layer.w13_weight_scale.data.shape[0], -1) + layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to( + torch.float32) + layer.w13_weight_offset.data = layer.w13_weight_offset.data.view( + layer.w13_weight_offset.data.shape[0], -1) + layer.w2_weight_scale.data = layer.w2_weight_scale.data.view( + layer.w2_weight_scale.data.shape[0], -1) + layer.w2_weight_offset.data = layer.w2_weight_offset.data.view( + layer.w2_weight_offset.data.shape[0], -1) diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py new file mode 100644 index 0000000..81f2968 --- /dev/null +++ b/vllm_ascend/torchair/torchair_attention.py @@ -0,0 +1,452 @@ +# +# Copyright (c) 2025 
Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Type + +import numpy as np +import torch +import torch.nn as nn +import torch_npu +from vllm.attention.backends.abstract import (AttentionImpl, AttentionLayer, + AttentionType) +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.config import VllmConfig +from vllm.utils import cdiv + +from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend, + AscendAttentionMetadataBuilder, + AscendAttentionState, + AscendMetadata) +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.torchair.utils import TorchairCommonAttentionMetadata +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p, + nd_to_nz_2d) + + +class AscendAttentionTorchairBackend(AscendAttentionBackend): + accept_output_buffer: bool = True + + @staticmethod + def get_name() -> str: + return "ASCEND_TORCHAIR" + + @staticmethod + def get_impl_cls() -> Type["AscendAttentionTorchairBackendImpl"]: + return AscendAttentionTorchairBackendImpl + + @staticmethod + def get_metadata_cls() -> Type["AscendTorchairMetadata"]: + return AscendTorchairMetadata + + @staticmethod + def get_builder_cls() -> type["AscendAttentionTorchairMetadataBuilder"]: + return AscendAttentionTorchairMetadataBuilder + + @staticmethod + def 
get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size, num_kv_heads * head_size) + + @staticmethod + def get_bsh_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size, num_kv_heads * head_size) + + +@dataclass +class AscendDecodeMetadata: + # Input positions for rotrary embeddings since for MLA the rotary + # position embeddings are applied inside the attention backend + input_positions: torch.Tensor + block_table: torch.Tensor + seq_lens: torch.Tensor + max_seq_lens: int + seq_lens_list: list[int] + attn_mask: Optional[torch.Tensor] = None + + +@dataclass +class AscendTorchairMetadata(AscendMetadata): + + decode: Optional[AscendDecodeMetadata] = None + + +class AscendAttentionTorchairMetadataBuilder(AscendAttentionMetadataBuilder): + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + super().__init__(vllm_config, device) + self.max_num_blocks_per_req = cdiv( + self.model_config.max_model_len, + self.vllm_config.cache_config.block_size) + self.max_blocks = (self.model_config.max_model_len + + self.vllm_config.cache_config.block_size - + 1) // self.vllm_config.cache_config.block_size + + def _get_graph_runner_block_tables( + self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor: + max_blocks = self.max_blocks + + graph_block_tables = torch.zeros((num_seqs, max_blocks), + dtype=block_tables.dtype, + device=block_tables.device) + + num_blocks = block_tables.size(1) + if num_blocks <= max_blocks: + graph_block_tables[:num_seqs, : + num_blocks] = block_tables[:num_seqs, : + num_blocks] + else: + graph_block_tables[:num_seqs, : + max_blocks] = block_tables[:num_seqs, : + max_blocks] + + return graph_block_tables[:, :max_blocks] + + def build_torchair_graph_dummy( + self, common_attn_metadata: TorchairCommonAttentionMetadata + ) -> 
AscendTorchairMetadata: + device = self.device + num_reqs = common_attn_metadata.num_reqs + block_table = torch.zeros((num_reqs, self.max_blocks), + dtype=torch.int32, + device=device) + block_table = self._get_graph_runner_block_tables( + num_reqs, block_table) + seq_lens = torch.ones(num_reqs, dtype=torch.int32, device=device) + input_positions = torch.zeros(num_reqs, + dtype=torch.int32, + device=device).long() + slot_mapping = torch.full((num_reqs, ), + PAD_SLOT_ID, + dtype=torch.int32, + device=device) + query_start_loc = torch.full((num_reqs, ), + -1, + dtype=torch.int32, + device=device) + + decode_metadata = AscendDecodeMetadata(input_positions=input_positions, + block_table=block_table, + seq_lens=seq_lens, + seq_lens_list=seq_lens.tolist(), + max_seq_lens=1) + + attn_metadata = AscendTorchairMetadata( + num_actual_tokens=common_attn_metadata.num_actual_tokens, + block_tables=block_table, + query_lens=0, + query_start_loc=query_start_loc, + seq_lens=seq_lens, + slot_mapping=slot_mapping, + attn_state=AscendAttentionState.DecodeOnly, + decode=decode_metadata) + return attn_metadata + + def build( + self, + common_attn_metadata: AscendCommonAttentionMetadata, + model: nn.Module, + ): + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + + block_table = common_attn_metadata.block_table_tensor + block_table[:num_reqs, :self.max_num_blocks_per_req] = ( + block_table[:num_reqs]) + + seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + slot_mapping = common_attn_metadata.slot_mapping_cpu[: + num_actual_tokens].to( + self.device, + non_blocking= + True) + attn_mask = common_attn_metadata.attn_mask + + attn_state = common_attn_metadata.attn_state + if is_310p() and attn_state == AscendAttentionState.PrefillNoCache: + mask_nz = nd_to_nz_2d(attn_mask) + attn_mask = torch_npu.npu_format_cast(mask_nz.contiguous(), 29) + + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu[: + num_reqs + + 1] + 
query_start_loc = query_start_loc_cpu.to(self.device, + non_blocking=True) + query_lens = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] + input_positions = common_attn_metadata.positions[: + num_actual_tokens].long( + ) + + decode_metadata = None + graph_pad_size = common_attn_metadata.graph_pad_size + use_torchair_graph = graph_pad_size > -1 + if common_attn_metadata.attn_state in [ + AscendAttentionState.DecodeOnly, + ]: + max_seq_lens = seq_lens.max().item() + num_seqs = len(seq_lens) + if use_torchair_graph and common_attn_metadata.attn_state in [ + AscendAttentionState.DecodeOnly, + ]: + num_reqs_pad_size = 0 + num_token_pad_size = 0 + if graph_pad_size != 0: + pad_value = 0 + num_token_pad_size = graph_pad_size - num_actual_tokens + num_reqs_pad_size = ( + graph_pad_size // + common_attn_metadata.decode_token_per_req - num_reqs) + pad_value = 1 + padded_seq_lens = seq_lens.tolist() + [pad_value + ] * num_reqs_pad_size + + seq_lens = torch.from_numpy( + np.array(padded_seq_lens).astype(np.int32)) + padding = torch.full((num_token_pad_size, ), + PAD_SLOT_ID, + dtype=slot_mapping.dtype, + device=slot_mapping.device) + slot_mapping = torch.cat([slot_mapping, padding]) + block_table_padding = torch.zeros( + (num_reqs_pad_size, ) + block_table.shape[1:], + dtype=block_table.dtype, + device=block_table.device) + block_table = torch.cat([block_table, block_table_padding], + dim=0) + block_table = self._get_graph_runner_block_tables( + num_seqs + num_reqs_pad_size, block_table) + padding_0 = torch.zeros(num_token_pad_size, + dtype=input_positions.dtype, + device=input_positions.device) + input_positions = torch.cat([input_positions, padding_0]) + + decode_metadata = AscendDecodeMetadata( + input_positions=input_positions, + block_table=block_table, + seq_lens=seq_lens, + seq_lens_list=seq_lens.tolist(), + max_seq_lens=max_seq_lens, + attn_mask=None) + + attn_metadata = AscendTorchairMetadata( + decode=decode_metadata, + num_actual_tokens=num_actual_tokens, + 
block_tables=block_table, + query_start_loc=query_start_loc, + query_lens=query_lens, + seq_lens=seq_lens, + max_query_len=common_attn_metadata.max_query_len, + slot_mapping=slot_mapping, + attn_mask=attn_mask, + attn_state=attn_state, + enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp) + return attn_metadata + + +class AscendAttentionTorchairBackendImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float], + attn_type: str, + kv_sharing_target_layer_name: Optional[str], + **kwargs, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.hidden_size = self.num_heads * self.head_size + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, + dtype=torch.float32, + device="npu") + self.alibi_slopes = alibi_slopes + self.attn_type = attn_type + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.key_cache = None + self.value_cache = None + self.scale_tensor = torch.zeros((), device='npu', dtype=torch.int32) + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AscendTorchairMetadata, + output: Optional[torch.Tensor] = None, + trace_flag: bool = False, + ) -> torch.Tensor: + """Forward pass with Ascend attention. 
+ Args: + query: shape = [batch_size, seq_len, num_heads * head_size] + key: shape = [batch_size, seq_len, num_kv_heads * head_size] + value: shape = [batch_size, seq_len, num_kv_heads * head_size] + kv_cache: shape = [2, num_blocks, block_size, + num_kv_heads, head_size] + key_cache = [num_blocks, block_size, + num_kv_heads, head_size] + value_cache = [num_blocks, block_size, + num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [batch_size * seq_len, num_heads, head_size] + """ + num_tokens = query.shape[0] + use_kv_cache_quant = (kv_cache is not None and len(kv_cache) > 0 + and kv_cache[0].numel() > 0 + and kv_cache[0].dtype == torch.int8) + if output is None: + output = torch.empty(num_tokens, + self.num_heads, + self.head_size, + dtype=query.dtype, + device=query.device) + + if hasattr(layer, 'quant_method') and use_kv_cache_quant: + output = layer.quant_method.apply(layer, query, key, value, + kv_cache, attn_metadata, + self.attn_type, self.scale, + output) + return output.view(num_tokens, self.hidden_size) + + if attn_metadata is None: + return output.view(num_tokens, self.hidden_size) + + output = output.view(-1, self.num_heads, self.head_size) + + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 + attn_type = self.attn_type + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "AscendAttentionTorchairBackendImpl") + + if kv_cache is not None and kv_cache[0].numel() > 0: + key_cache, value_cache = kv_cache[0], kv_cache[1] + slots = attn_metadata.slot_mapping + + block_size = self.scale_tensor + key_cache.shape[1] + slots_indices = slots.reshape(-1, 1) + block_indices = slots_indices // block_size + slots_indices = slots_indices % block_size + indices = torch.cat((block_indices, slots_indices), dim=1) + torch_npu.npu_scatter_nd_update_(key_cache, indices, key) + 
torch_npu.npu_scatter_nd_update_(value_cache, indices, value) + + if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: + assert attn_metadata is not None + assert attn_metadata.attn_mask is not None + mask = attn_metadata.attn_mask + + # View q k v to BSH. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + + if is_310p(): + # align q k v output tensors + query = aligned_16(query) + key = aligned_16(key) + value = aligned_16(value) + output = aligned_16(output) + + # do reformat in case of broadcasted tensors + mask = mask.repeat(attn_metadata.seq_lens.size(0), 1, 1, 1) + mask = torch_npu.npu_format_cast(mask.contiguous(), + ACL_FORMAT_FRACTAL_NZ) + + torch_npu._npu_flash_attention(query=query, + key=key, + value=value, + mask=mask, + seq_len=attn_metadata.seq_lens, + scale_value=self.scale, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + out=output) + output = output[:num_tokens, :, :] + elif attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit: + assert attn_metadata is not None + assert attn_metadata.attn_mask is not None + compress_mask = attn_metadata.attn_mask + torch_npu._npu_flash_attention_qlens( + query=query, + key_cache=self.key_cache, + value_cache=self.value_cache, + block_table=attn_metadata.block_tables, + mask=compress_mask, + seq_len=attn_metadata.query_lens, + context_lens=attn_metadata.seq_lens, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + out=output) + elif attn_metadata.attn_state == AscendAttentionState.DecodeOnly: + decode_meta = attn_metadata.decode + assert decode_meta is not None + seq_lens = decode_meta.seq_lens_list + block_table = decode_meta.block_table + block_size = key_cache.shape[1] + query = query.view(num_tokens, 1, + self.num_heads * self.head_size).contiguous() + output = torch_npu.npu_incre_flash_attention( + query, + 
key_cache, + value_cache, + num_key_value_heads=self.num_kv_heads, + num_heads=self.num_heads, + actual_seq_lengths=seq_lens, + scale_value=self.scale, + block_table=block_table, + input_layout='BSH', + block_size=block_size) + else: + raise NotImplementedError( + "Torchair graph mode with non-MLA attention backend is still experimental." + "v1 scheduler(chunked prefill) is not supported at this moment. Please" + "setting 'ascend_scheduler_config':{'enabled':true} in additional_config" + "to use ascend scheduler.") + + return output.view(num_tokens, self.hidden_size) diff --git a/vllm_ascend/torchair/torchair_mla.py b/vllm_ascend/torchair/torchair_mla.py new file mode 100644 index 0000000..30ef293 --- /dev/null +++ b/vllm_ascend/torchair/torchair_mla.py @@ -0,0 +1,1321 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar + +import numpy as np +import torch +import torch.nn as nn +import torch_npu +from vllm.attention.backends.abstract import (AttentionBackend, AttentionLayer, + AttentionMetadata, + MLAAttentionImpl) +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.config import VllmConfig, get_current_vllm_config +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.utils import cdiv, round_down + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.attention.attention_v1 import AscendAttentionState +from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata, + split_decodes_and_prefills) +from vllm_ascend.multistream.base import MSAttentionMetadataSplitConfig +from vllm_ascend.multistream.context import get_multistream_comm_context +from vllm_ascend.multistream.ms_split import model_input_split_v1_mla_attn +from vllm_ascend.ops.attention import vanilla_chunked_prefill_mla +from vllm_ascend.torchair.utils import 
(TorchairCommonAttentionMetadata, + npu_stream_switch, npu_wait_tensor) +from vllm_ascend.utils import npu_prefetch +from vllm_ascend.worker.npu_input_batch import InputBatch + +if TYPE_CHECKING: + from vllm.v1.core.sched.output import SchedulerOutput + + +class AscendMLATorchairBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @staticmethod + def get_name() -> str: + return "ASCEND_MLA_TORCHAIR" + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return AscendMLATorchairMetadata + + @staticmethod + def get_builder_cls(): + return AscendMLATorchairMetadataBuilder + + @staticmethod + def get_kv_cache_shape(num_blocks: int, block_size: int, num_kv_heads: int, + head_size: int) -> tuple[int, ...]: + return (num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def get_impl_cls() -> Type["MLAAttentionImpl"]: + return AscendMLATorchairImpl + + +@dataclass +class AscendMLATorchairPrefillMetadata: + """ Prefill Specific Metadata for Ascend""" + + @dataclass + class TorchairChunkedContextMetadata: + # New for MLA (compared to FlashAttention) + # For handling chunked prefill + cu_seq_lens: torch.Tensor + starts: torch.Tensor + seq_tot: list[int] + max_seq_lens: list[int] + workspace: torch.Tensor + chunk_seq_lens: torch.Tensor + + attn_mask: torch.Tensor + query_lens: list[int] + seq_lens: list[int] + context_lens: torch.Tensor + input_positions: torch.Tensor + query_start_loc: torch.Tensor + block_table: torch.Tensor + max_query_len: int + max_seq_lens: int + chunked_context: Optional[TorchairChunkedContextMetadata] = None + sin: torch.Tensor = None + cos: torch.Tensor = None + + +@dataclass +class AscendMLATorchairDecodeMetadata: + # Input positions for rotrary embeddings since for MLA the rotary + # position embeddings are applied inside the attention backend + input_positions: torch.Tensor + block_table: torch.Tensor + seq_lens: torch.Tensor + max_seq_lens: int + seq_lens_list: list[int] + actual_seq_lengths_q: 
Optional[list[int]] = None + attn_mask: Optional[torch.Tensor] = None + sin: torch.Tensor = None + cos: torch.Tensor = None + + +@dataclass +class AscendMLATorchairMetadata: + """Metadata for MLACommon. + + NOTE: Please read the comment at the top of the file before trying to + understand this class + """ + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + num_actual_tokens: int # Number of tokens excluding padding. + slot_mapping: torch.Tensor + query_start_loc: torch.Tensor + seq_lens: torch.Tensor + block_tables: torch.Tensor + + # New for MLA (compared to FlashAttention) + # For handling prefill decode split + num_decodes: int + num_decode_tokens: int + num_prefills: int + + # For logging. + num_input_tokens: int = 0 # Number of tokens including padding. 
+ + query_lens: Optional[list[int]] = None + # The dimension of the attention heads + head_dim: Optional[int] = None + attn_mask: torch.Tensor = None + # chunked prefill by default if no attn_states passed + attn_state: AscendAttentionState = AscendAttentionState.ChunkedPrefill + + decode: Optional[AscendMLATorchairDecodeMetadata] = None + prefill: Optional[AscendMLATorchairPrefillMetadata] = None + enable_dbo_across_dp: bool = False + + def __post_init__(self): + pass + # supported_head_sizes = AscendMLABackend.get_supported_head_sizes() + # if self.head_dim is not None and self.head_dim \ + # not in supported_head_sizes: + # raise ValueError( + # f"Only {supported_head_sizes} are supported for head_dim,", + # f"received {self.head_dim}.") + + def split_metadata_for_multistream( + self, + ms_split_config: MSAttentionMetadataSplitConfig, + ) -> list["AscendMLATorchairMetadata"]: + """Split metadata for multi-stream with AscendMLATorchairMetadata""" + return model_input_split_v1_mla_attn( + ms_split_config=ms_split_config, + attn_metadata=self, + _metadata_cls=AscendMLATorchairMetadata, + ) + + +M = TypeVar("M", bound=AscendMLATorchairMetadata) + + +class AscendMLATorchairMetadataBuilder: + """ + NOTE: Please read the comment at the top of the file before trying to + understand this class + """ + + # _attn_mask_builder = None + def __init__(self, + vllm_config: VllmConfig, + device: torch.device, + metadata_cls: Optional[AscendMLATorchairMetadata] = None): + self.metadata_cls: Optional[AscendMLATorchairMetadata] = metadata_cls \ + if metadata_cls is not None else AscendMLATorchairMetadata # type: ignore + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.device = device + scheduler_config = vllm_config.scheduler_config + self.block_size = vllm_config.cache_config.block_size + self.max_blocks = (vllm_config.model_config.max_model_len + + self.block_size - 1) // self.block_size + self.chunked_prefill_enabled = 
scheduler_config.chunked_prefill_enabled + if self.chunked_prefill_enabled: + self.chunked_prefill_workspace_size = min( + # Max sure there is enough for 8 full length request or at least + # 4 pages of cache per request + max(8 * self.model_config.max_model_len, + 4 * scheduler_config.max_num_seqs * self.block_size), + # For long-context models try not to over-allocate limiting + # kv-cache space, limiting it to 64k tokens, + # which would result in the workspace being: + # 2*(576)*(64*1024) = 144mb + # (assuming 576 MLA head dim, and fp16) + # which would result in up-projected context being + # 2*(192*128)*(64*1024) = 3gb + # (assuming 192 QK head dim, 128 heads, and fp16) + 128 * 1024) + assert self.chunked_prefill_workspace_size >= \ + scheduler_config.max_num_seqs * self.block_size + self.chunked_prefill_workspace = torch.empty( + (self.chunked_prefill_workspace_size, + self.model_config.get_head_size()), + dtype=self.model_config.dtype, + device=device, + ) + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.rope_dim = self.model_config.hf_text_config.qk_rope_head_dim + self.cos_cache = None + self.sin_cache = None + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + # We now want to reorder the batch so that the "decode" requests are at + # the front and the "prefill" requests are at the using the least amount + # swaps possible. 
(NOTE for now we loosely use "decode" to mean requests + # where attention is likely memory-bound and "prefill" to mean requests + # where attention is likely compute-bound, TODO(lucas): figure out a + # better naming here) + decodes = [] + prefills = [] + + for i, req_id in enumerate(input_batch.req_ids): + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_spec_tokens = len( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])) + # For torch air graph mode we treat spec decoding as decode. + if self.torchair_graph_enabled: + if num_tokens - num_spec_tokens == 1: + decodes.append(i) + else: + prefills.append(i) + # For eager mode we treat spec decoding as chunked prefill. + else: + if num_tokens == 1: + decodes.append(i) + else: + prefills.append(i) + + # We hope that this is fairly minimal since decodes + # should be around for a number of iterations so hopefully they are + # relatively stationary (and new request are generally appended to the + # persistent batch so already should be at the back) + # To achieve this we loop over the decodes in descending order and + # the prefills in ascending order. We swap decodes from the "back" + # i.e. past where the last decode should be in the reodorered with + # prefills from the front of the batch. 
+ # `decodes` and `prefills` are already in ascending order just based on + # the above loop + num_decodes = len(decodes) + num_prefills = len(prefills) + first_prefill = 0 + modified_batch = False + + for i in range(1, min(num_decodes, num_prefills) + 1): + # If the decode is at the "back" of the batch, i, we can swap it + # with the prefill closest to the front of the batch + if decodes[num_decodes - i] >= num_decodes: + input_batch.swap_states(prefills[first_prefill], + decodes[num_decodes - i]) + first_prefill += 1 + modified_batch = True + else: + break + + # Save for next `build` call + # TODO(lucas): this is a bit of a hack, we should probably have a + # better way of doing this + return modified_batch + + def _get_graph_runner_block_tables( + self, num_seqs: int, block_tables: torch.Tensor) -> torch.Tensor: + max_blocks = self.max_blocks + + graph_block_tables = torch.zeros((num_seqs, max_blocks), + dtype=block_tables.dtype, + device=block_tables.device) + + num_blocks = block_tables.size(1) + if num_blocks <= max_blocks: + graph_block_tables[:num_seqs, : + num_blocks] = block_tables[:num_seqs, : + num_blocks] + else: + graph_block_tables[:num_seqs, : + max_blocks] = block_tables[:num_seqs, : + max_blocks] + + return graph_block_tables[:, :max_blocks] + + def build_torchair_graph_dummy( + self, + common_attn_metadata: TorchairCommonAttentionMetadata, + ) -> AscendMLATorchairMetadata: + device = self.device + num_reqs = common_attn_metadata.num_reqs + block_table = torch.zeros((num_reqs, self.max_blocks), + dtype=torch.int32, + device=device) + block_table = self._get_graph_runner_block_tables( + num_reqs, block_table) + num_tokens = num_reqs * common_attn_metadata.decode_token_per_req + seq_lens = torch.zeros(num_reqs, dtype=torch.int32, device=device) + seq_lens_list = [0] * num_reqs + input_positions = torch.zeros(num_tokens, + dtype=torch.int32, + device=device).long() + slot_mapping = torch.full((num_tokens, ), + PAD_SLOT_ID, + dtype=torch.int32, + 
device=device) + query_start_loc = torch.full((num_reqs, ), + -1, + dtype=torch.int32, + device=device) + sin = torch.ones(num_tokens, + 1, + 1, + self.rope_dim, + dtype=self.model_config.dtype, + device=device) + cos = torch.ones(num_tokens, + 1, + 1, + self.rope_dim, + dtype=self.model_config.dtype, + device=device) + if self.vllm_config.speculative_config is not None and\ + self.vllm_config.speculative_config.method == 'deepseek_mtp': + attn_state = AscendAttentionState.SpecDecoding + num_decode_tokens = 2 + else: + attn_state = AscendAttentionState.DecodeOnly + num_decode_tokens = 1 + decode_metadata = AscendMLATorchairDecodeMetadata( + input_positions=input_positions, + block_table=block_table, + seq_lens=seq_lens, + seq_lens_list=seq_lens_list, + max_seq_lens=1, + attn_mask=common_attn_metadata.spec_attn_mask, + actual_seq_lengths_q=common_attn_metadata. + actual_seq_lengths_q[:num_reqs], + sin=sin, + cos=cos, + ) + return self.metadata_cls( # type: ignore + num_input_tokens=common_attn_metadata.num_actual_tokens, + num_actual_tokens=common_attn_metadata.num_actual_tokens, + slot_mapping=slot_mapping, + head_dim=self.model_config.get_head_size(), + num_decodes=1, + num_decode_tokens=num_decode_tokens, + num_prefills=0, + attn_mask=common_attn_metadata.attn_mask, + attn_state=attn_state, + prefill=None, + decode=decode_metadata, + query_start_loc=query_start_loc, + seq_lens=seq_lens, + block_tables=block_table, + ) + + def build( + self, + common_attn_metadata: AscendCommonAttentionMetadata, + model: nn.Module, + ) -> AscendMLATorchairMetadata: + num_reqs = common_attn_metadata.num_reqs + num_actual_tokens = common_attn_metadata.num_actual_tokens + query_start_loc = common_attn_metadata.query_start_loc + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + if self.torchair_graph_enabled and common_attn_metadata.attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: + decode_threshold = 
common_attn_metadata.decode_token_per_req + else: + # TODO(xyx): remove the if condition after mla supports torch mode speculative decoding + decode_threshold = 1 + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \ + split_decodes_and_prefills(common_attn_metadata, decode_threshold=decode_threshold) + assert num_decodes + num_prefills == num_reqs + assert num_decode_tokens + num_prefill_tokens == num_actual_tokens + + # Note(simon): be careful about the CPU <> GPU memory movement in this + # function. We should avoid GPU -> CPU sync as much as possible because + # it blocks on all previous kernels. + device = self.device + + block_table = (common_attn_metadata.block_table_tensor[:num_reqs]) + slot_mapping = common_attn_metadata.slot_mapping_cpu[: + num_actual_tokens].to( + device, + non_blocking= + True) + input_positions = common_attn_metadata.positions[: + num_actual_tokens].long( + ) + + if self.cos_cache is None: + self.cos_cache = model.model.layers[ + 0].self_attn.rotary_emb.cos_cached + self.sin_cache = model.model.layers[ + 0].self_attn.rotary_emb.sin_cached + if self.cos_cache.dtype != self.model_config.dtype: # type: ignore + self.cos_cache = self.cos_cache.to( # type: ignore + self.model_config.dtype) # type: ignore + self.sin_cache = self.sin_cache.to( # type: ignore + self.model_config.dtype) # type: ignore + + query_seq_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] + query_lens = query_seq_lens_cpu[:num_reqs] + seq_lens = common_attn_metadata.seq_lens_cpu[:num_reqs] + num_computed_tokens_cpu = (seq_lens - query_lens) + + prefill_metadata = None + chunked_context_metadata = None + if num_prefills > 0: + reqs_start = num_decodes # prefill_start + tokens_start = num_decode_tokens + max_query_len = query_lens[tokens_start:].max().item() + max_seq_lens = seq_lens[tokens_start:].max().item() + prefill_query_start_loc = query_start_loc[ + reqs_start:] - query_start_loc[reqs_start] + + context_lens_cpu = 
num_computed_tokens_cpu[reqs_start:num_reqs] + max_context_len_cpu = context_lens_cpu.max().item() + num_prefills_with_context_cpu = (context_lens_cpu > 0).sum().item() + if self.chunked_prefill_enabled and max_context_len_cpu > 0: + max_context_chunk = (self.chunked_prefill_workspace_size // + num_prefills_with_context_cpu) + max_context_chunk = round_down(max_context_chunk, + self.block_size) + + assert max_context_chunk > 0 + num_chunks = cdiv(max_context_len_cpu, max_context_chunk) + chunk_starts = torch.arange(num_chunks, dtype=torch.int32) \ + .unsqueeze(1).expand(-1, num_prefills) * max_context_chunk + chunk_ends = torch.min(context_lens_cpu.unsqueeze(0), + chunk_starts + max_context_chunk) + chunk_seq_lens = (chunk_ends - chunk_starts).clamp(min=0) + cu_seq_lens_cpu = torch.zeros(num_chunks, + num_prefills + 1, + dtype=torch.int32, + pin_memory=True) + torch.cumsum(chunk_seq_lens, + dim=1, + out=cu_seq_lens_cpu[:, 1:], + dtype=torch.int32) + chunked_context_metadata = \ + AscendMLATorchairPrefillMetadata.TorchairChunkedContextMetadata( + cu_seq_lens=cu_seq_lens_cpu.to(device, non_blocking=True), + starts=chunk_starts.to(device, non_blocking=True), + seq_tot=chunk_seq_lens.sum(dim=1).tolist(), + max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(), + chunk_seq_lens=chunk_seq_lens, + workspace=self.chunked_prefill_workspace, + ) + prefill_input_positions = input_positions[tokens_start:] + cos = self.cos_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[ + prefill_input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + prefill_metadata = AscendMLATorchairPrefillMetadata( + attn_mask=common_attn_metadata.attn_mask, + query_lens=query_lens[tokens_start:], + seq_lens=seq_lens, + context_lens=seq_lens[tokens_start:], + input_positions=prefill_input_positions, + block_table=block_table[reqs_start:, ...], + max_query_len=max_query_len, + max_seq_lens=max_seq_lens, + query_start_loc=prefill_query_start_loc, 
+ chunked_context=chunked_context_metadata, + sin=sin, + cos=cos, + ) + + decode_metadata = None + graph_pad_size = common_attn_metadata.graph_pad_size + use_torchair_graph = graph_pad_size != -1 + if num_decodes > 0: + actual_seq_lengths_q = query_start_loc[1:num_decodes + 1].tolist() + max_seq_lens = seq_lens[:num_decodes].max().item() + seq_lens = seq_lens[:num_decode_tokens] + input_positions = input_positions[:num_decode_tokens] + block_table = block_table[:num_decode_tokens, ...] + num_token_pad_size = 0 + if use_torchair_graph and common_attn_metadata.attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: + num_reqs_pad_size = 0 + if graph_pad_size != 0: + pad_value = 0 + num_token_pad_size = graph_pad_size - num_decode_tokens + num_reqs_pad_size = ( + graph_pad_size // + common_attn_metadata.decode_token_per_req - num_reqs) + padded_seq_lens = seq_lens.tolist( + ) + [pad_value] * num_reqs_pad_size + else: + padded_seq_lens = seq_lens.tolist() + + seq_lens = torch.from_numpy( + np.array(padded_seq_lens).astype(np.int32)) + seq_lens_list = padded_seq_lens + slot_padding = torch.full((num_token_pad_size, ), + PAD_SLOT_ID, + dtype=slot_mapping.dtype, + device=slot_mapping.device) + slot_mapping = torch.cat([slot_mapping, slot_padding]) + block_table_padding = torch.zeros( + (num_reqs_pad_size, ) + block_table.shape[1:], + dtype=block_table.dtype, + device=block_table.device) + block_table = torch.cat([block_table, block_table_padding], + dim=0) + block_table = self._get_graph_runner_block_tables( + num_reqs + num_reqs_pad_size, block_table) + position_padding = torch.zeros(num_token_pad_size, + dtype=input_positions.dtype, + device=input_positions.device) + input_positions = torch.cat( + [input_positions, position_padding]) + actual_seq_lengths_q = ( + actual_seq_lengths_q + common_attn_metadata. 
+ actual_seq_lengths_q[num_reqs:num_reqs + + num_reqs_pad_size]) + else: + seq_lens_list = seq_lens.tolist() + # mtp torchair + PD scenario, last element of actual_seq_lengths_q must equal to batch_size(num_tokens) + batch_size = num_decode_tokens + num_token_pad_size + if actual_seq_lengths_q[-1] != batch_size \ + and common_attn_metadata.attn_state == AscendAttentionState.SpecDecoding: + actual_seq_lengths_q[-1] = batch_size + + cos = self.cos_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + sin = self.sin_cache[input_positions].unsqueeze( # type: ignore + 1).unsqueeze(2) + + decode_metadata = AscendMLATorchairDecodeMetadata( + input_positions=input_positions, + block_table=block_table, + seq_lens=seq_lens, + seq_lens_list=seq_lens_list, + max_seq_lens=max_seq_lens, + attn_mask=common_attn_metadata.spec_attn_mask, + actual_seq_lengths_q=actual_seq_lengths_q, + sin=sin, + cos=cos) + + return self.metadata_cls( # type: ignore + num_actual_tokens=num_actual_tokens, + query_lens=query_lens.tolist(), + slot_mapping=slot_mapping, + head_dim=self.model_config.get_head_size(), + num_decodes=num_decodes, + num_decode_tokens=num_decode_tokens, + num_prefills=num_prefills, + attn_mask=common_attn_metadata.attn_mask, + attn_state=common_attn_metadata.attn_state, + prefill=prefill_metadata, + decode=decode_metadata, + query_start_loc=query_start_loc, + block_tables=block_table, + seq_lens=seq_lens, + enable_dbo_across_dp=common_attn_metadata.enable_dbo_across_dp, + ) + + +class AscendMLATorchairImpl(MLAAttentionImpl): + """ + NOTE: Please read the comment at the top of the file before trying to + understand this class + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + logits_soft_cap: Optional[float], + attn_type: str, + kv_sharing_target_layer_name: Optional[str], + **kwargs, + ) -> None: + self.num_heads = 
num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + + # MLA Args + self.q_lora_rank = kwargs['q_lora_rank'] + self.kv_lora_rank = kwargs['kv_lora_rank'] + self.qk_nope_head_dim = kwargs['qk_nope_head_dim'] + self.qk_rope_head_dim = kwargs['qk_rope_head_dim'] + self.qk_head_dim = kwargs['qk_head_dim'] + self.v_head_dim = kwargs['v_head_dim'] + self.rotary_emb = kwargs['rotary_emb'] + self.q_proj = kwargs['q_proj'] + self.kv_b_proj = kwargs['kv_b_proj'] + self.o_proj = kwargs['o_proj'] + self.kv_a_proj_with_mqa = kwargs.get('kv_a_proj_with_mqa', None) + self.kv_a_layernorm = kwargs.get('kv_a_layernorm', None) + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.tp_size = get_tensor_model_parallel_world_size() + + ascend_config = get_ascend_config() + self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled + self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz + self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp + self.running_in_graph = False + + # Adapt torch air graph mode with spec decoding. 
+ speculative_config = get_current_vllm_config().speculative_config + if speculative_config is not None: + self.spec_token_num = speculative_config.num_speculative_tokens + assert self.spec_token_num > 0 + + def _v_up_proj_and_o_proj(self, x, enable_multistream_mla: bool = False): + # Convert from (B, N, L) to (N, B, L) + x = x.view(-1, self.num_heads, self.kv_lora_rank).transpose(0, 1) + # Multiply (N, B, L) x (N, L, V) -> (N, B, V) + x = torch.bmm(x, self.W_UV) + # Convert from (N, B, V) to (B, N * V) + x = x.transpose(0, 1).reshape(-1, self.num_heads * self.v_head_dim) + if hasattr(self, "running_in_graph") and not self.running_in_graph: + return x + MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 # 16MB + npu_prefetch(self.o_proj.weight, + x, + max_size=MAX_O_PROJ_PREFETCH_SIZE, + enabled=enable_multistream_mla) + return self.o_proj(x, is_prefill=False)[0] + + # Return `ql_nope`, `q_pe` + def _q_proj_and_k_up_proj(self, x): + q_nope, q_pe = self.q_proj(x)[0]\ + .view(-1, self.num_heads, self.qk_head_dim)\ + .split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1) + + # Convert from (B, N, P) to (N, B, P) + q_nope = q_nope.transpose(0, 1) + # Multiply (N, B, P) x (N, P, L) -> (N, B, L) + ql_nope = torch.bmm(q_nope, self.W_UK_T) + # Convert from (N, B, L) to (B, N, L) + return ql_nope.transpose(0, 1), q_pe + + def process_weights_after_loading(self, act_dtype: torch.dtype): + + def get_layer_weight(layer): + WEIGHT_NAMES = ("weight", "qweight", "weight_packed") + for attr in WEIGHT_NAMES: + if hasattr(layer, attr): + return getattr(layer, attr) + raise AttributeError( + f"Layer '{layer}' has no recognized weight attribute:" + f" {WEIGHT_NAMES}.") + + def get_and_maybe_dequant_weights(layer: LinearBase): + if not isinstance(layer.quant_method, UnquantizedLinearMethod): + # NOTE: This should only be used offline, since it's O(N^3) + eye = torch.eye(layer.input_size_per_partition, + dtype=act_dtype, + device=get_layer_weight(layer).device) + dequant_weights = 
layer.quant_method.apply(layer, + eye, + bias=None) + del eye + # standardize to (output, input) + return dequant_weights.T + return layer.weight + + # we currently do not have quantized bmm's which are needed for + # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform + # the bmm's in 16-bit, the extra memory overhead of this is fairly low + kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T + assert kv_b_proj_weight.shape == ( + self.kv_lora_rank, + self.num_heads * (self.qk_nope_head_dim + self.v_head_dim)), ( + f"{kv_b_proj_weight.shape=}, " + f"{self.kv_lora_rank=}, " + f"{self.num_heads=}, " + f"{self.qk_nope_head_dim=}, " + f"{self.v_head_dim=}") + kv_b_proj_weight = kv_b_proj_weight.view( + self.kv_lora_rank, + self.num_heads, + self.qk_nope_head_dim + self.v_head_dim, + ) + + W_UK, W_UV = kv_b_proj_weight.split( + [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + + # Convert from (L, N, V) to (N, L, V) + self.W_UV = W_UV.transpose(0, 1).contiguous() + # Convert from (L, N, P) to (N, P, L) + self.W_UK_T = W_UK.permute(1, 2, 0).contiguous() + + # Waiting for BMM NZ support + # self.W_UV.data = torch_npu.npu_format_cast(self.W_UV.data, 29) + # self.W_UK_T.data = torch_npu.npu_format_cast(self.W_UK_T.data, 29) + + def _compute_prefill_context( + self, + query: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], + rope_dim: int, + attn_metadata: AscendMLATorchairMetadata, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + ): + assert len(kv_c_and_k_pe_cache) > 1 + prefill_metadata = attn_metadata.prefill + if prefill_metadata is None or prefill_metadata.chunked_context is None: + return prefix_output, prefix_lse + + iters = len(prefill_metadata.chunked_context.seq_tot) + q_pe = query[..., self.qk_nope_head_dim:] + q_nope = query[..., :self.qk_nope_head_dim] + + seq_len1 = torch.tensor(prefill_metadata.query_lens, dtype=torch.int32) + cache_kv_c = kv_c_and_k_pe_cache[0] + cache_k_pe = kv_c_and_k_pe_cache[1] + 
num_heads = cache_k_pe.size(2) + latent_kv_dim = kv_c_and_k_pe_cache[0].size(-1) + for i in range(iters): + toks = prefill_metadata.chunked_context.seq_tot[i] + + seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i] + seq_len = torch.stack([seq_len1, seq_len2]) + kv_c_normed = torch.empty(toks, + num_heads, + latent_kv_dim, + dtype=query.dtype, + device=query.device) + k_pe = torch.empty(toks, + num_heads, + rope_dim, + dtype=query.dtype, + device=query.device) + + torch_npu.atb.npu_paged_cache_load( + cache_kv_c, + cache_k_pe, + prefill_metadata.block_table, + seq_len2.to(query.device), + seq_starts=prefill_metadata.chunked_context.starts[i], + key=kv_c_normed, + value=k_pe, + ) + + kv_c_normed = kv_c_normed.squeeze() + kv_nope = self.kv_b_proj(kv_c_normed)[0].view( \ + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim) + k_nope, v = kv_nope\ + .split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) + mask = torch.triu( + torch.ones(512, 512, device=query.device, dtype=query.dtype), + 1) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=v, + mask=mask, + seqlen=seq_len, + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=prefix_output, + prev_lse=prefix_lse, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="no_mask", + input_layout="type_bsnd", + calc_type="calc_type_default", + output=prefix_output, + softmax_lse=prefix_lse) + return prefix_output, prefix_lse + + def _forward_prefill( + self, + query: torch.Tensor, + kv_c_normed: torch.Tensor, + k_pe: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], + attn_metadata: AscendMLATorchairMetadata, + ) -> torch.Tensor: + assert attn_metadata.prefill is not None + assert len(kv_c_and_k_pe_cache) > 1 + + num_tokens = query.size(0) + attn_output = torch.empty(num_tokens, + self.num_heads, + self.v_head_dim, + dtype=query.dtype, + device=query.device) + 
k_nope, value = self.kv_b_proj(kv_c_normed)[0].view( + -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim).split( + [self.qk_nope_head_dim, self.v_head_dim], dim=-1) + k_pe = k_pe.expand((*k_nope.shape[:-1], -1)) + # Here is only 2 possibility of input, ChunkedPrefill or PrefillNoCache + ascend_config = get_ascend_config() + + if attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit + ] and not ascend_config.chunked_prefill_for_mla: + attn_output_torch = torch.empty(num_tokens, + self.num_heads * self.v_head_dim, + dtype=query.dtype, + device=query.device) + # current requests is chunked in prefill, disable flash attention with chunked prefill + vanilla_chunked_prefill_mla( + output=attn_output_torch, + query=query, + kv_cache=kv_c_and_k_pe_cache, + block_tables=attn_metadata.prefill.block_table, + query_lens=attn_metadata.prefill.query_lens, + context_lens=attn_metadata.prefill.context_lens, + kv_b_proj=self.kv_b_proj, + max_query_len=attn_metadata.prefill.max_query_len, + max_context_len=attn_metadata.prefill.max_seq_lens, + nope_dim=self.qk_nope_head_dim, + rope_dim=self.qk_rope_head_dim, + v_head_dim=self.v_head_dim, + scale=self.scale, + alibi_slopes=None, + causal=True) + elif attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit + ]: + attn_lse = torch.empty(self.num_heads, + num_tokens, + dtype=torch.float32, + device=query.device) + q_pe = query[..., self.qk_nope_head_dim:] + q_nope = query[..., :self.qk_nope_head_dim] + mask = torch.triu( + torch.ones(512, 512, device=query.device, dtype=query.dtype), + 1) # 512: mask only support 512 + if attn_metadata.num_prefills > 1: + mask = mask.unsqueeze(0).repeat(attn_metadata.num_prefills, 1, + 1) + torch_npu.atb.npu_ring_mla( + q_nope=q_nope, + q_rope=q_pe, + k_nope=k_nope, + k_rope=k_pe, + value=value, + mask=mask, + 
seqlen=torch.tensor(attn_metadata.prefill.query_lens, + dtype=torch.int32), + head_num=self.num_heads, + kv_head_num=self.num_heads, + pre_out=None, + prev_lse=None, + qk_scale=self.scale, + kernel_type="kernel_type_high_precision", + mask_type="mask_type_triu", + input_layout="type_bsnd", + calc_type="calc_type_first_ring", + output=attn_output, + softmax_lse=attn_lse) + attn_output, attn_lse = self._compute_prefill_context( \ + query, kv_c_and_k_pe_cache, self.qk_rope_head_dim, attn_metadata, attn_output, attn_lse) + + elif attn_metadata.attn_state == AscendAttentionState.PrefillNoCache: + key = torch.cat((k_nope, k_pe), dim=-1) + torch_npu._npu_flash_attention( + query=query, + key=key, + value=value, + mask=attn_metadata.attn_mask, + seq_len=attn_metadata.prefill.context_lens, + scale_value=self.scale, + num_heads=self.num_heads, + num_kv_heads=self.num_heads, + out=attn_output) + attn_output = attn_output.view(-1, self.num_heads, self.v_head_dim) + else: + raise RuntimeError( + "Unexpected path reached, AscendMLATorchairImpl should only have PrefillNoCache, PrefillCacheHit, ChunkedPrefill and SpecDecoding scenario in forward prefill, please file a bug to vllm-ascend !" 
+ ) + attn_output = attn_output.reshape( + [num_tokens, self.num_heads * self.v_head_dim]) + if attn_metadata.attn_state in [ + AscendAttentionState.ChunkedPrefill, + AscendAttentionState.SpecDecoding, + AscendAttentionState.PrefillCacheHit + ] and not ascend_config.chunked_prefill_for_mla: + attn_output = attn_output_torch + + return attn_output + + def exec_kv( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_NZ" if self.enable_kv_nz else "PA" + k_pe, k_nope, _, _ = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + ) + return k_pe, k_nope, kv + + def exec_kv_prefill( + self, + hidden_states: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + kv_cache: Tuple, + slots: torch.Tensor, + ): + + B = hidden_states.shape[0] + N = self.num_kv_heads + S = 1 + kv = self.kv_a_proj_with_mqa(hidden_states)[0] + # npu_kv_rmsnorm_rope_cache needs [B, N, S, D] + kv = kv.view(B, N, S, self.kv_lora_rank + self.qk_rope_head_dim) + cache_mode = "PA_BLK_NZ" if self.enable_kv_nz else "PA" + _, _, k_pe, k_nope = torch_npu.npu_kv_rmsnorm_rope_cache( + kv, + self.kv_a_layernorm.weight, + cos, + sin, + slots.to(torch.int64), + kv_cache[1], + kv_cache[0], + epsilon=self.kv_a_layernorm.variance_epsilon, + cache_mode=cache_mode, + is_output_kv=True, + ) + return k_pe, k_nope + + def rope_single( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + B, N, D = x.shape + S = 1 + x = x.view(B, N, S, D) + x = torch_npu.npu_interleave_rope(x, cos, sin) + return x.view(B, 
N, D) + + def _forward_decode( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + kv_c_and_k_pe_cache: Tuple[torch.Tensor], + attn_metadata: AscendMLATorchairMetadata, + enable_multistream_mla: bool = False, + ) -> torch.Tensor: + decode_meta = attn_metadata.decode + assert decode_meta is not None + num_tokens = q_nope.size(0) + if self.running_in_graph or self.running_chunkprefilll_with_torchair: + # shape of knope/k_pe for npu graph mode should be: + # [num_blocks, num_kv_heads, block_size, self.kv_lora_rank/self.qk_rope_head_dim] + block_size = kv_c_and_k_pe_cache[0].shape[1] + actual_seq_lengths = None + if self.enable_kv_nz: + k_nope = k_nope.view(-1, self.num_kv_heads, + self.kv_lora_rank // 16, block_size, 16) + k_pe = k_pe.view(-1, self.num_kv_heads, + self.qk_rope_head_dim // 16, block_size, 16) + input_layout = "BSND" + else: + k_nope = k_nope.view(-1, self.num_kv_heads, block_size, + self.kv_lora_rank) + k_pe = k_pe.view(-1, self.num_kv_heads, block_size, + self.qk_rope_head_dim) + input_layout = "BNSD" + + if attn_metadata.attn_state == AscendAttentionState.SpecDecoding: + assert num_tokens % self.spec_token_num == 0 + input_layout = "TND" + # [bs * q_seq_len, num_heads_per_rank, dim] + q_nope = q_nope.view(num_tokens, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, -1) + sparse_mode = 3 + spec_attn_mask = attn_metadata.decode.attn_mask # type:ignore + actual_seq_lengths = decode_meta.actual_seq_lengths_q + else: + if self.enable_kv_nz: + q_nope = q_nope.view(num_tokens, 1, self.num_heads, -1) + q_pe = q_pe.view(num_tokens, 1, self.num_heads, -1) + else: + q_nope = q_nope.view(num_tokens, self.num_heads, 1, -1) + q_pe = q_pe.view(num_tokens, self.num_heads, 1, -1) + sparse_mode = 0 + spec_attn_mask = None + + attn_output, _ = torch_npu.npu_fused_infer_attention_score( + q_nope, + k_nope, + k_nope, + query_rope=q_pe, + key_rope=k_pe, + num_heads=self.num_heads, + 
num_key_value_heads=self.num_kv_heads, + input_layout=input_layout, + atten_mask=spec_attn_mask, + sparse_mode=sparse_mode, + scale=self.scale, + antiquant_mode=0, + antiquant_scale=None, + block_table=decode_meta.block_table, + block_size=block_size, + actual_seq_lengths_kv=decode_meta.seq_lens_list, + actual_seq_lengths=actual_seq_lengths) + else: + # The MLA_PA path will be used as default path in the future, `_npu_paged_attention_mla` will + # be removed after the torch_npu contains `torch_npu.atb.npu_multi_head_latent_attention` become + # public available + assert len(kv_c_and_k_pe_cache) > 1 + if envs_ascend.VLLM_ASCEND_MLA_PA: + attn_output = torch_npu.atb.npu_multi_head_latent_attention( + q_nope, q_pe, kv_c_and_k_pe_cache[0], + kv_c_and_k_pe_cache[1], attn_metadata.decode.block_table, + attn_metadata.decode.seq_lens, self.num_heads, self.scale, + self.num_kv_heads) + else: + q = torch.cat([q_nope, q_pe], dim=-1) + attn_output = torch.empty( + [num_tokens, self.num_heads, self.kv_lora_rank], + dtype=q.dtype, + device=q.device) + k_cache = torch.cat( + [kv_c_and_k_pe_cache[0], kv_c_and_k_pe_cache[1]], dim=-1) + torch_npu._npu_paged_attention_mla( + query=q, + key_cache=k_cache, + num_kv_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale_value=self.scale, + block_table=attn_metadata.decode. 
+ block_table, # type:ignore + context_lens=attn_metadata.decode.seq_lens, # type:ignore + mla_vheadsize=self.kv_lora_rank, + out=attn_output) + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is None: + return self._v_up_proj_and_o_proj(attn_output, + enable_multistream_mla) + else: + current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + return self._v_up_proj_and_o_proj(attn_output) + + def forward( + self, + layer: AttentionLayer, + hidden_states_or_q_c: torch.Tensor, # query in unified attn + hidden_states_or_kv_c_normed: torch.Tensor, # key in unified attn + k_pe: torch.Tensor, # value in unified attn + kv_cache: Tuple[torch.Tensor], + attn_metadata: M, + output: Optional[torch.Tensor] = None, + enable_multistream_mla: bool = False, + ckq: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + assert output is not None, "Output tensor must be provided." + if attn_metadata is None: + # Profiling run. 
+ return output + self.running_in_graph = self.torchair_graph_enabled and attn_metadata.attn_state in [ + AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding + ] + self.running_chunkprefilll_with_torchair = self.torchair_graph_enabled and attn_metadata.attn_state == AscendAttentionState.ChunkedPrefill + num_actual_toks = attn_metadata.num_actual_tokens + if k_pe is None and not self.running_in_graph: + kv_c, k_pe = self.kv_a_proj_with_mqa( + hidden_states_or_kv_c_normed)[0].split( + [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) + kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) + else: + kv_c_normed = hidden_states_or_kv_c_normed + assert attn_metadata.num_decodes is not None and \ + attn_metadata.num_prefills is not None and \ + attn_metadata.num_decode_tokens is not None + has_decode = attn_metadata.num_decodes > 0 + has_prefill = attn_metadata.num_prefills > 0 + num_decode_tokens = attn_metadata.num_decode_tokens + if not self.running_in_graph: + # Inputs and outputs may be padded for CUDA graphs + output_padded = output + output = output[:num_actual_toks, ...] + if not self.torchair_graph_enabled: + kv_c_normed = kv_c_normed[:num_actual_toks, ...] + prefill_k_c_normed = kv_c_normed[num_decode_tokens:] + if not self.running_in_graph: + hidden_states_or_q_c = hidden_states_or_q_c[:num_actual_toks, ...] + prefill_hs_or_q_c = hidden_states_or_q_c[num_decode_tokens:] + decode_hs_or_q_c = hidden_states_or_q_c[:num_decode_tokens] + prefill_hs = hidden_states_or_kv_c_normed[num_decode_tokens:] + # if not self.torchair_graph_enabled: + k_pe = k_pe[:num_actual_toks, ...] 
+ k_pe = k_pe.unsqueeze(1) + decode_k_pe = k_pe[:num_decode_tokens] + prefill_k_pe = k_pe[num_decode_tokens:] + else: + decode_hs_or_q_c = hidden_states_or_q_c + if has_decode: + decode_k_nope = None + assert attn_metadata.decode is not None + if self.running_in_graph or self.running_chunkprefilll_with_torchair: + cos = attn_metadata.decode.cos + sin = attn_metadata.decode.sin + if self.running_chunkprefilll_with_torchair: + decode_hs = ( + hidden_states_or_kv_c_normed[:num_decode_tokens]) + slots = attn_metadata.slot_mapping[:num_decode_tokens] + decode_k_pe, decode_k_nope, decode_kv = self.exec_kv( + decode_hs, cos, sin, kv_cache, slots) + else: + with npu_stream_switch("mla_secondary", + 0, + enabled=enable_multistream_mla): + npu_wait_tensor(hidden_states_or_kv_c_normed, + ckq, + enabled=enable_multistream_mla) + decode_k_pe, decode_k_nope, decode_kv = self.exec_kv( + hidden_states_or_kv_c_normed, cos, sin, kv_cache, + attn_metadata.slot_mapping) + # Without explicitly controlling the order, IndexByTensor operations + # would be placed after `matmul W_KV_T` hindering the overlapping of + # KvRmsNormRopeCache and SingleRope. + npu_wait_tensor(decode_hs_or_q_c, + cos, + enabled=enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + sin, + enabled=enable_multistream_mla) + npu_wait_tensor(decode_hs_or_q_c, + decode_kv, + enabled=enable_multistream_mla) + + decode_ql_nope, decode_q_pe = \ + self._q_proj_and_k_up_proj(decode_hs_or_q_c) + if self.running_in_graph: + with npu_stream_switch("mla_secondary", + 0, + enabled=enable_multistream_mla): + npu_wait_tensor(decode_q_pe, + decode_k_pe, + enabled=enable_multistream_mla) + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + elif self.running_chunkprefilll_with_torchair: + decode_q_pe = self.rope_single(decode_q_pe, cos, sin) + else: + decode_q_pe[...], decode_k_pe[...] 
= self.rotary_emb( + attn_metadata.decode.input_positions, + decode_q_pe.contiguous(), + decode_k_pe, + max_seq_len=attn_metadata.decode.max_seq_lens) + if has_prefill: + assert attn_metadata.prefill is not None + prefill_q = self.q_proj(prefill_hs_or_q_c)[0]\ + .view(-1, self.num_heads, self.qk_head_dim) + prefill_q_pe = prefill_q[..., self.qk_nope_head_dim:] + prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim] + if self.torchair_graph_enabled: + num_tokens = prefill_hs_or_q_c.shape[0] + cos = attn_metadata.prefill.cos + sin = attn_metadata.prefill.sin + + prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin) + prefill_k_pe, prefill_k_nope = self.exec_kv_prefill( + prefill_hs, cos, sin, kv_cache, + attn_metadata.slot_mapping[num_decode_tokens:]) + + kv_c_normed = prefill_k_nope[:num_actual_toks, ...] + prefill_k_c_normed = prefill_k_nope + prefill_k_pe = prefill_k_pe.view(num_tokens, self.num_kv_heads, + -1) + prefill_q = torch.cat([prefill_q_nope, prefill_q_pe], dim=-1) + else: + prefill_q_pe[...], prefill_k_pe[...] 
= self.rotary_emb( + attn_metadata.prefill.input_positions, + prefill_q_pe.contiguous(), + prefill_k_pe, + max_seq_len=attn_metadata.prefill.max_seq_lens) + + assert len( + kv_cache + ) > 1, "the number of kv cache should be greater than 1, namely (nope_cache and rope_cache)" + if self.torchair_graph_enabled: + if kv_cache[0].numel() > 0 and has_prefill: + slots = attn_metadata.slot_mapping + # NOTE: Separate the kv cache in advance to avoid OOM or other issues + torch_npu._npu_reshape_and_cache( + key=kv_c_normed.view(num_tokens, self.num_kv_heads, -1), + value=prefill_k_pe, + key_cache=kv_cache[0], + value_cache=kv_cache[1], + slot_indices=slots[num_decode_tokens:]) + else: + kv_c_normed = kv_c_normed.view( + [num_actual_toks, self.num_kv_heads, -1]) + torch_npu._npu_reshape_and_cache( + key=kv_c_normed, + value=k_pe, + key_cache=kv_cache[0], + value_cache=kv_cache[1], + slot_indices=attn_metadata.slot_mapping) + if not self.running_in_graph: + o_proj_input_shape = (num_actual_toks, + self.num_heads * self.v_head_dim) + o_proj_input = torch.empty(o_proj_input_shape, + dtype=hidden_states_or_q_c.dtype, + device=hidden_states_or_q_c.device) + if has_prefill: + # FIX: aicore move should be also placed on the comm stream in dbo, + # otherwise it may affect the accuracy + # TODO: use an elegant way to overlap + output_prefill = self._forward_prefill(prefill_q, + prefill_k_c_normed, + prefill_k_pe, kv_cache, + attn_metadata) + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is not None: + current_ms_metadata.before_comm_event.record() + with torch.npu.stream(current_ms_metadata.comm_stream): + current_ms_metadata.before_comm_event.wait() + o_proj_input[num_decode_tokens:] = output_prefill + else: + o_proj_input[num_decode_tokens:] = output_prefill + + if has_decode: + if self.running_in_graph: + return self._forward_decode(decode_ql_nope, decode_q_pe, + decode_k_nope, decode_k_pe, + kv_cache, attn_metadata, + enable_multistream_mla) + 
else: + output_decode = self._forward_decode(decode_ql_nope, + decode_q_pe, + decode_k_nope, + decode_k_pe, kv_cache, + attn_metadata) + current_ms_metadata = get_multistream_comm_context() + if current_ms_metadata is not None: + with torch.npu.stream(current_ms_metadata.comm_stream): + o_proj_input[:num_decode_tokens] = output_decode + else: + o_proj_input[:num_decode_tokens] = output_decode + + current_ms_metadata = get_multistream_comm_context() + MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 # 16MB + if current_ms_metadata is None: + npu_prefetch(self.o_proj.weight, + o_proj_input, + max_size=MAX_O_PROJ_PREFETCH_SIZE, + enabled=enable_multistream_mla) + + output[...] = self.o_proj( + o_proj_input, + is_prefill=True, + is_force_scatter=self.enable_shared_expert_dp)[0] + else: + with torch.npu.stream(current_ms_metadata.comm_stream): + npu_prefetch(self.o_proj.weight, + o_proj_input, + max_size=MAX_O_PROJ_PREFETCH_SIZE, + enabled=enable_multistream_mla) + output[...] = self.o_proj( + o_proj_input, + is_prefill=True, + is_force_scatter=self.enable_shared_expert_dp)[0] + current_ms_metadata.after_comm_event.record() + del o_proj_input + return output_padded diff --git a/vllm_ascend/torchair/torchair_model_runner.py b/vllm_ascend/torchair/torchair_model_runner.py new file mode 100644 index 0000000..2b34f9b --- /dev/null +++ b/vllm_ascend/torchair/torchair_model_runner.py @@ -0,0 +1,446 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py +# isort: skip_file + +import types +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch_npu +import vllm.envs as envs_vllm +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.parallel_state import get_dp_group +from vllm.forward_context import get_forward_context +from vllm.logger import logger + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.platform import NPUPlatform +from vllm_ascend.torchair.utils import ( + TORCHAIR_CACHE_DIR, TorchairCommonAttentionMetadata, + check_torchair_cache_exist, converting_weight_acl_format, + register_torchair_model, torchair_ops_patch, + torchair_quant_method_register, write_kv_cache_bytes_to_file) +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + is_310p) +from vllm_ascend.worker.model_runner_v1 import NPUModelRunner + + +class NPUTorchairModelRunner(NPUModelRunner): + + def __init__(self, vllm_config: VllmConfig, device: torch.device): + super().__init__(vllm_config, device) + ascend_config = get_ascend_config() + self.new_kv_cache_bytes = -1 + self.torchair_compiled_model = None # type: ignore + self.torchair_compiled_models = {} # type: ignore + self.use_cached_npu_graph = ascend_config.torchair_graph_config.use_cached_graph + self.use_cached_kv_cache_bytes = ascend_config.torchair_graph_config.use_cached_kv_cache_bytes + self.torchair_graph_batch_sizes = ascend_config.torchair_graph_config.graph_batch_sizes + if ascend_config.torchair_graph_config.graph_batch_sizes_init: + self.init_torchair_graph_batch_sizes() + + self.check_torchair_graph_batch_sizes() + + 
torch._dynamo.cache_size.config.cache_size_limit += len( + self.torchair_graph_batch_sizes) + torch._dynamo.config.capture_dynamic_output_shape_ops = True + torch._logging.set_logs( + recompiles=envs_ascend.VLLM_ASCEND_TRACE_RECOMPILES) + + self._check_batch_sizes_consistency() + register_torchair_model() + torchair_ops_patch() + torchair_quant_method_register() + + def _sync_metadata_across_dp( + self, num_tokens: int, with_prefill: bool, enable_dbo: bool + ) -> tuple[int, Optional[torch.Tensor], bool, bool]: + """Override from NPUModelRunner to pad num_tokens""" + if self.dp_size == 1: + if not with_prefill: + maybe_padded_num_tokens = self.select_torchair_padded_batch_size( + num_tokens) + return maybe_padded_num_tokens, None, with_prefill, enable_dbo + return num_tokens, None, with_prefill, enable_dbo + + num_tokens_across_dp = torch.zeros(self.dp_size + 2, + dtype=torch.int32, + device="npu") + num_tokens_across_dp[self.dp_rank] = num_tokens + num_tokens_across_dp[-2] = int(with_prefill) + num_tokens_across_dp[-1] = int(not enable_dbo) + dist.all_reduce(num_tokens_across_dp, + group=get_dp_group().device_group) + with_prefill = bool(num_tokens_across_dp[-2]) + enable_dbo = not bool(num_tokens_across_dp[-1]) + num_tokens_across_dp = num_tokens_across_dp[:-2] + + if not with_prefill: + max_num_token = num_tokens_across_dp.max().item() + maybe_padded_num_tokens = self.select_torchair_padded_batch_size( + max_num_token) + num_tokens_across_dp = torch.full((self.dp_size, ), + maybe_padded_num_tokens, + dtype=torch.int32, + device="npu") + else: + maybe_padded_num_tokens = num_tokens + + return maybe_padded_num_tokens, num_tokens_across_dp, with_prefill, enable_dbo + + def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn): + # NOTE: If torchair graph mode and not with_prefill, + # we can't skip_attn, it will cause graph recompile. 
+ if not with_prefill: + common_attn_metadata = TorchairCommonAttentionMetadata( + num_reqs=num_reqs, + num_actual_tokens=1, + actual_seq_lengths_q=self.actual_seq_lengths_q, + attn_mask=self.attn_mask, + spec_attn_mask=self.spec_attn_mask, + decode_token_per_req=self.decode_token_per_req, + ) + attn_metadata = self.attn_metadata_builder.build_torchair_graph_dummy( + common_attn_metadata) + else: + attn_metadata = super()._build_attention_metadata( + with_prefill, num_reqs, skip_attn) + return attn_metadata + + def _generate_dummy_run_hidden_states(self, with_prefill, + is_torchair_compile, input_ids, + positions, attn_metadata, num_tokens, + intermediate_tensors, inputs_embeds): + + if not with_prefill: + # Only mark static while compiling + if is_torchair_compile: + torch._dynamo.mark_static(input_ids) + torch._dynamo.mark_static(positions) + torch._dynamo.mark_static(attn_metadata.decode.block_table) + torch._dynamo.mark_static(attn_metadata.decode.input_positions) + torch._dynamo.mark_static(get_forward_context().mc2_mask) + if hasattr(attn_metadata.decode, "sin"): + torch._dynamo.mark_static(attn_metadata.decode.sin) + torch._dynamo.mark_static(attn_metadata.decode.cos) + torch._dynamo.mark_static(attn_metadata.slot_mapping) + if self.speculative_config: + torch._dynamo.mark_static(attn_metadata.decode.attn_mask) + for kv in self.kv_caches: + assert isinstance(kv, tuple), "kv_cache must be a tuple" + torch._dynamo.mark_static(kv[0]) + torch._dynamo.mark_static(kv[1]) + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ) + + compiled_model = self._get_torchair_lazy_compiled_model(num_tokens) + model_kwargs = {} + model_kwargs["kv_caches"] = self.kv_caches + model_kwargs["attn_metadata"] = attn_metadata + hidden_states = compiled_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=None, + **model_kwargs, + ) + else: + if is_310p(): + converting_weight_acl_format(self.model, 
ACL_FORMAT_FRACTAL_ND) + hidden_states = super()._generate_dummy_run_hidden_states( + with_prefill, is_torchair_compile, input_ids, positions, + attn_metadata, num_tokens, intermediate_tensors, inputs_embeds) + return hidden_states + + def _convert_torch_format(self, kv_cache): + kv_cache = torch_npu.npu_format_cast(kv_cache, ACL_FORMAT_FRACTAL_ND) + return kv_cache + + def _compile_torchair_graph(self, torchair_graph_batch_sizes) -> None: + # Trigger torchair graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for idx, num_tokens in enumerate(reversed(torchair_graph_batch_sizes)): + for _ in range(self.vllm_config.compilation_config. + cudagraph_num_of_warmups): + self._dummy_run(num_tokens, is_torchair_compile=True) + self._dummy_run(num_tokens, is_torchair_compile=True) + logger.info("Batchsize %d is compiled successfully: %d/%d.", + num_tokens, idx + 1, len(torchair_graph_batch_sizes)) + + def _capture_model(self): + """Override from NPUModelRunner to use torchair graph capture.""" + # TODO(NeverRaR): Calling graph_capture(device=self.device) in + # torchair graph capture can cause some issues, so now we just + # temporarily split the codepath for the two different graph patterns. + torchair_graph_batch_sizes = self.torchair_graph_batch_sizes + graph_num = len(torchair_graph_batch_sizes) + + if self.use_cached_npu_graph and not check_torchair_cache_exist(): + # If caching is enabled but does not exist (either + # use_cached_kv_cache_bytes is disabled or kv_cache_bytes are + # different), we will compile the model twice. The first time is + # used to generate the cache, and the second time is used to load the + # cache to skip the overhead caused by Dynamo guard mechanism. + logger.info( + "Cache compilation for torchair graph is enabled. 
Now we compile graph to genetate" + " torchair cache, this usually takes %.1f~%.1f mins.", + 0.5 * graph_num, 1.5 * graph_num) + self._compile_torchair_graph(torchair_graph_batch_sizes) + NPUPlatform.synchronize() + # Note: We reset dynamo and reload the compiled torchair cached computation graph below + # that was compiled above. This operation reduces graph launch time by 2-4ms and avoids + # runtime errors caused by configuration mismatches in graph mode. + torch._dynamo.reset() + self.torchair_compiled_models.clear() + if self.use_cached_npu_graph: + logger.info( + "Loading torchair graph cache, this usually takes %.1f~%.1f mins.", + 0.3 * graph_num, 0.5 * graph_num) + self._compile_torchair_graph(torchair_graph_batch_sizes) + else: + logger.info( + "Capturing torchair graph, this usually takes %.1f~%.1f mins.", + 0.5 * graph_num, 1.5 * graph_num) + self._compile_torchair_graph(torchair_graph_batch_sizes) + + if self.use_cached_kv_cache_bytes and self.new_kv_cache_bytes > 0: + write_kv_cache_bytes_to_file(torch.distributed.get_rank(), + self.new_kv_cache_bytes) + + def _use_aclgraph(self) -> bool: + return False + + def _check_batch_sizes_consistency(self) -> None: + if not dist.is_initialized(): + return + + local = torch.tensor(self.torchair_graph_batch_sizes, + device="cpu", + dtype=torch.int32) + gathered_graph_batch_size = local.clone() + dist.all_reduce(gathered_graph_batch_size, + group=get_dp_group().cpu_group) + expected = local * self.dp_size + + if not torch.equal(gathered_graph_batch_size, expected): + diff_idxs = (gathered_graph_batch_size != expected).nonzero( + as_tuple=False).flatten().tolist() + raise AssertionError( + f"[Graph BatchSize Mismatch] Found mismatches at indices {diff_idxs}.\n" + f"Local (rank {self.dp_rank}): {local.tolist()}\n" + f"Sum over ranks: {gathered_graph_batch_size.tolist()}\n" + f"Expected if all equal: {[v * self.dp_size for v in local.tolist()]}" + ) + + def _update_graph_pad_size(self, with_prefill, graph_pad_size): 
+ if not with_prefill: + self.graph_pad_size = graph_pad_size + else: + super()._update_graph_pad_size(with_prefill, graph_pad_size) + + def _update_input_ids_and_positions(self, input_ids, positions, + num_input_tokens, with_prefill, + padded_num_tokens_across_dp): + """Override from NPUModelRunner to update input_ids and positions""" + input_ids, positions = super()._update_input_ids_and_positions( + input_ids, positions, num_input_tokens, with_prefill, + padded_num_tokens_across_dp) + + if not with_prefill: + input_ids = self.input_ids[:padded_num_tokens_across_dp] + positions = self.positions[:padded_num_tokens_across_dp] + return input_ids, positions + + def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill, + padded_num_tokens_across_dp, + input_ids, positions, + intermediate_tensors, + inputs_embeds): + model_kwargs = { + "kv_caches": self.kv_caches, + "attn_metadata": attn_metadata + } + if not with_prefill: + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_NZ) + compiled_model = self._get_torchair_lazy_compiled_model( + padded_num_tokens_across_dp) + hidden_states = compiled_model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + else: + assert self.model is not None + if is_310p(): + converting_weight_acl_format(self.model, ACL_FORMAT_FRACTAL_ND) + + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + return hidden_states + + def _get_torchair_lazy_compiled_model(self, batch_size: int): + if batch_size < 0 or batch_size > self.torchair_graph_batch_sizes[-1]: + raise ValueError( + f"Bad graph batch size:{batch_size}! 
max_graph_batch_sizes:{self.torchair_graph_batch_sizes[-1]}" + ) + + compiled_model = self.torchair_compiled_models.get( + batch_size + ) if self.use_cached_npu_graph else self.torchair_compiled_model + + if compiled_model: + return compiled_model + + import torchair # type: ignore + from torchair import patch_for_hcom # type: ignore + + patch_for_hcom() + + if is_310p(): + # on 300I Duo platform, we need to patch broadcast. however, this patch will be + # overwritten by patch_for_hcom in torchair. so we need to re-patch it here. + from vllm_ascend.patch.platform.patch_common.patch_distributed import \ + communication_adaptation_310p + communication_adaptation_310p() + + config = torchair.CompilerConfig() + if get_ascend_config().torchair_graph_config.mode: + config.mode = get_ascend_config().torchair_graph_config.mode + config.experimental_config.frozen_parameter = True + # enabling tiling_schedule_optimize on 300I Duo has some bugs, so we have to + # disable it on 300I Duo platform now. 
+ config.experimental_config.tiling_schedule_optimize = not is_310p() + config.experimental_config.enable_view_optimize = \ + get_ascend_config().torchair_graph_config.enable_view_optimize + torch.npu.set_compile_mode(jit_compile=False) + if not self.use_cached_npu_graph: + npu_backend = torchair.get_npu_backend(compiler_config=config) + self.torchair_compiled_model = torch.compile( + self.model, + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=npu_backend) + return self.torchair_compiled_model + else: + # Generate a new forward proxy code object to prevent the invalidation of + # compilation cache caused by dynamo retracing + forward_proxy_name = f"{self.model.__class__.__name__}_forward_with_batch_size_{batch_size}" + forward_fn = self.model.forward + code = forward_fn.__code__ + # Mark code object with a new proxy name + modified_code = code.replace(co_name=forward_proxy_name, ) + + modified_func = types.FunctionType(modified_code, + forward_fn.__globals__, + name=forward_proxy_name, + argdefs=forward_fn.__defaults__) + + self.model.__dict__[forward_proxy_name] = modified_func.__get__( + self.model, nn.Module) + self.torchair_compiled_models[ + batch_size] = torchair.inference.cache_compile( + self.model.__dict__[forward_proxy_name], + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + cache_dir=TORCHAIR_CACHE_DIR, + config=config, + ge_cache=False) + return self.torchair_compiled_models[batch_size] + + def init_torchair_graph_batch_sizes(self): + start_graph_batch_size = 4 + tp_size = get_tensor_model_parallel_world_size() + + # NOTE: When use all2all | mc2, We need to slice the `num_tokens` dimension into `tp_size` blocks + start_graph_batch_size = max(start_graph_batch_size, tp_size) + + while (start_graph_batch_size <= self.max_num_reqs): + self.torchair_graph_batch_sizes.append(start_graph_batch_size) + start_graph_batch_size *= 2 + + def select_torchair_padded_batch_size(self, batch_size: int): + 
for padded_batch_size in self.torchair_graph_batch_sizes: + if batch_size <= padded_batch_size: + # we treat batch_size as num of requests + return padded_batch_size + raise ValueError( + f"cur batch_size is invalid, torchair_graph_batch_sizes is " + f"{self.torchair_graph_batch_sizes}, but cur batch_size is {batch_size}." + ) + + def check_torchair_graph_batch_sizes(self): + # return graph_batch_sizes according to the max number of tokens + # first pad according to the number of requests + if len(self.torchair_graph_batch_sizes) == 0: + self.torchair_graph_batch_sizes = [1, self.max_num_reqs] + else: + self.torchair_graph_batch_sizes = sorted( + self.torchair_graph_batch_sizes) + while self.torchair_graph_batch_sizes[-1] > self.max_num_reqs: + self.torchair_graph_batch_sizes.pop() + if len(self.torchair_graph_batch_sizes) == 0: + logger.warning( + "torch_graph_batch_sizes is invalid, reset it to [1, max_num_seqs]" + ) + self.torchair_graph_batch_sizes = [1, self.max_num_reqs] + if self.torchair_graph_batch_sizes[-1] < self.max_num_reqs: + self.torchair_graph_batch_sizes.append(self.max_num_reqs) + + # padded max number tokens = max_num_req * decode_token_per_req + self.torchair_graph_batch_sizes = [ + graph_batch_size * self.decode_token_per_req + for graph_batch_size in self.torchair_graph_batch_sizes + ] + + # NOTE: when enable_expert_parallel, we need to check if `graph_batch_size` is divisible by `tp_size` + tp_size = self.parallel_config.tensor_parallel_size + if self.parallel_config.enable_expert_parallel: + new_graph_batch_sizes = [] + for graph_batch_size in self.torchair_graph_batch_sizes: + cur_graph_batch_size = (graph_batch_size + tp_size - + 1) // tp_size * tp_size + if cur_graph_batch_size not in new_graph_batch_sizes and \ + cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens: + new_graph_batch_sizes.append(cur_graph_batch_size) + elif cur_graph_batch_size > self.scheduler_config.max_num_batched_tokens \ + and 
self.decode_token_per_req > 1: + logger.warning( + f"torchair_graph_batch_sizes {cur_graph_batch_size} is bigger than max_num_batched_tokens", + f"{self.scheduler_config.max_num_batched_tokens} will skip this batch size." + ) + self.torchair_graph_batch_sizes = new_graph_batch_sizes + + def _build_drafter_prepare_inputs_torchair_param(self): + return True + + def get_dp_padding(self, num_tokens): + """Override from NPUModelRunner to get dp padding""" + return 0, None diff --git a/vllm_ascend/torchair/torchair_worker.py b/vllm_ascend/torchair/torchair_worker.py new file mode 100644 index 0000000..85f2fb4 --- /dev/null +++ b/vllm_ascend/torchair/torchair_worker.py @@ -0,0 +1,63 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from vllm.logger import logger + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.torchair.torchair_model_runner import NPUTorchairModelRunner +from vllm_ascend.torchair.utils import (check_kv_cache_bytes_cache_exist, + delete_torchair_cache_file, + read_kv_cache_bytes_from_file) +from vllm_ascend.worker.worker_v1 import NPUWorker + + +class NPUTorchairWorker(NPUWorker): + """Torchair worker bases on NPUWorker. 
Only torchair specified code should be added in this class.""" + + def determine_available_memory(self) -> int: + """Override determine_available_memory to use cached torchair kv_cache_bytes.""" + + available_kv_cache_memory = super().determine_available_memory() + + if get_ascend_config( + ).torchair_graph_config.use_cached_kv_cache_bytes and check_kv_cache_bytes_cache_exist( + ): + old_kv_cache_bytes = read_kv_cache_bytes_from_file( + torch.distributed.get_rank()) + if 0 < old_kv_cache_bytes <= available_kv_cache_memory: + logger.info( + f"Use cached torchair kv_cache_bytes: {old_kv_cache_bytes}" + ) + self.model_runner.new_kv_cache_bytes = old_kv_cache_bytes + return old_kv_cache_bytes + else: + logger.info( + "Cached torchair kv_cache_bytes is too big, invalidate old torchair_cache" + ) + delete_torchair_cache_file() + bytes_floating_tolerance = 1024 * 1024 * envs_ascend.VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE + available_kv_cache_memory -= bytes_floating_tolerance + logger.info(f"Use new kv_cache_bytes: {available_kv_cache_memory}") + self.model_runner.new_kv_cache_bytes = available_kv_cache_memory + + return available_kv_cache_memory + + def init_device(self): + """Override init_device to init torchair model runner""" + device = self._init_device() + # Init ModelRunner here, so that we have access to self.device. + self.model_runner = NPUTorchairModelRunner(self.vllm_config, device) diff --git a/vllm_ascend/torchair/utils.py b/vllm_ascend/torchair/utils.py new file mode 100644 index 0000000..13d5879 --- /dev/null +++ b/vllm_ascend/torchair/utils.py @@ -0,0 +1,205 @@ +import fcntl +import os +import shutil +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass + +import torch +import torch_npu + +try: + # Recent release of torchair has moved these ops to `.scope`. 
+ from torchair.scope import npu_stream_switch as _npu_stream_switch + from torchair.scope import npu_wait_tensor as _npu_wait_tensor +except ImportError: + from torchair.ops import NpuStreamSwitch as _npu_stream_switch + from torchair.ops import npu_wait_tensor as _npu_wait_tensor + +KV_CACHE_BYTES_CACHE_PATH_NAME = ".kv_cache_bytes" +KV_CACHE_BYTES_CACHE_FILE_NAME = "kv_cache_bytes" +TORCHAIR_CACHE_PATH_NAME = ".torchair_cache" +TORCHAIR_CACHE_DIR = os.path.join( + os.getenv('TORCHAIR_CACHE_HOME', os.getcwd()), TORCHAIR_CACHE_PATH_NAME) + + +@dataclass +class TorchairCommonAttentionMetadata: + """ + Per-batch attention metadata, shared across layers and backends. + AttentionMetadataBuilder instances use it to construct per-layer metadata. + + For many of the tensors we keep both GPU and CPU versions. + """ + + num_reqs: int + """Number of requests""" + + num_actual_tokens: int + """Total number of tokens in batch""" + + decode_token_per_req: int + + actual_seq_lengths_q: list[int] + + attn_mask: torch.Tensor = None + + spec_attn_mask: torch.Tensor = None + + graph_pad_size: int = -1 + + +@contextmanager +def _file_lock(file_descriptor, lock_type): + fcntl.flock(file_descriptor, lock_type) + try: + yield + finally: + fcntl.flock(file_descriptor, fcntl.LOCK_UN) + + +def _get_torchair_current_work_dir(file_name=None): + if file_name is None: + return TORCHAIR_CACHE_DIR + return os.path.join(TORCHAIR_CACHE_DIR, file_name) + + +def check_torchair_cache_exist(): + res = False + torch_air_abs_path = _get_torchair_current_work_dir() + if os.path.exists(torch_air_abs_path): + file_list = os.listdir(torch_air_abs_path) + if len(file_list) != 0: + res = True + return res + + +def check_kv_cache_bytes_cache_exist(): + res = False + kv_cache_bytes_cache_abs_path = _get_torchair_current_work_dir( + KV_CACHE_BYTES_CACHE_PATH_NAME) + if os.path.exists(kv_cache_bytes_cache_abs_path): + file_list = os.listdir(kv_cache_bytes_cache_abs_path) + if len(file_list) != 0: + res = True + 
return res + + +def read_kv_cache_bytes_from_file(rank) -> int: + kv_cache_bytes = -1 + kv_cache_bytes_cache_abs_path = _get_torchair_current_work_dir( + KV_CACHE_BYTES_CACHE_PATH_NAME) + kv_cache_bytes_file = os.path.join( + kv_cache_bytes_cache_abs_path, + f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}") + with open(kv_cache_bytes_file, "r", encoding="utf-8") as f: + with _file_lock(f, fcntl.LOCK_SH): + kv_cache_bytes = int(f.readline()) + return kv_cache_bytes + + +def write_kv_cache_bytes_to_file(rank, kv_cache_bytes): + kv_cache_bytes_cache_abs_path = _get_torchair_current_work_dir( + KV_CACHE_BYTES_CACHE_PATH_NAME) + os.makedirs(kv_cache_bytes_cache_abs_path, exist_ok=True) + kv_cache_bytes_file = os.path.join( + kv_cache_bytes_cache_abs_path, + f"{rank}_{KV_CACHE_BYTES_CACHE_FILE_NAME}") + with open(kv_cache_bytes_file, "w", encoding="utf-8") as f: + with _file_lock(f, fcntl.LOCK_EX): + f.write(f"{kv_cache_bytes}") + + +def delete_torchair_cache_file(): + torch_air_abs_path = _get_torchair_current_work_dir() + try: + shutil.rmtree(torch_air_abs_path) + except FileNotFoundError: + pass + + +def npu_stream_switch(tag: str, priority: int, *, enabled: bool = True): + return _npu_stream_switch(tag, priority) if enabled else nullcontext() + + +def npu_wait_tensor(self: torch.Tensor, + dependency: torch.Tensor, + *, + enabled: bool = True): + return _npu_wait_tensor(self, dependency) if enabled else self + + +def converting_weight_acl_format(model, format): + # currently, there are some operations which do not support ACL_FORMAT_FRACTAL_NZ + # in eager mode but support it in torchair graph mode. since ACL_FORMAT_FRACTAL_NZ + # is much more preferred than ACL_FORMAT_FRACTAL_ND on 300I Duo, we add this + # conversion when using torchair graph mode on 300I Duo platform. + # TODO: we will remove this conversion if npu_quant_grouped_matmul_dequant + # accepts weight format of ACL_FORMAT_FRACTAL_NZ in eager mode. 
+ from vllm.model_executor.layers.fused_moe.layer import FusedMoE + + for module in model.modules(): + if isinstance(module, FusedMoE): + if torch_npu.get_npu_format(module.w13_weight.data) == format: + return + module.w13_weight.data = torch_npu.npu_format_cast( + module.w13_weight.data, format) + module.w2_weight.data = torch_npu.npu_format_cast( + module.w2_weight.data, format) + + +def register_torchair_model(): + from vllm import ModelRegistry + + ModelRegistry.register_model( + "DeepSeekMTPModel", + "vllm_ascend.torchair.models.torchair_deepseek_mtp:TorchairDeepSeekMTP" + ) + + ModelRegistry.register_model( + "DeepseekV2ForCausalLM", + "vllm_ascend.torchair.models.torchair_deepseek_v2:TorchairDeepseekV2ForCausalLM" + ) + + ModelRegistry.register_model( + "DeepseekV3ForCausalLM", + "vllm_ascend.torchair.models.torchair_deepseek_v3:TorchairDeepseekV3ForCausalLM" + ) + + ModelRegistry.register_model( + "Qwen2ForCausalLM", + "vllm_ascend.torchair.models.qwen2:CustomQwen2ForCausalLM") + + ModelRegistry.register_model( + "Qwen3MoeForCausalLM", + "vllm_ascend.torchair.models.qwen3_moe:CustomQwen3MoeForCausalLM") + + ModelRegistry.register_model( + "PanguProMoEForCausalLM", + "vllm_ascend.torchair.models.torchair_pangu_moe:PanguProMoEForCausalLM" + ) + + +def torchair_quant_method_register(): + from vllm_ascend.quantization.quantizer import \ + SUPPORT_ASCEND_QUANTIZER_TYPE + from vllm_ascend.torchair.quantization.torchair_quantizer import ( + TorchairW4A8DYNAMICQuantizer, TorchairW8A8DYNAMICQuantizer) + + SUPPORT_ASCEND_QUANTIZER_TYPE[ + "W8A8_DYNAMIC"] = TorchairW8A8DYNAMICQuantizer + SUPPORT_ASCEND_QUANTIZER_TYPE[ + "W4A8_DYNAMIC"] = TorchairW4A8DYNAMICQuantizer + + +def torchair_ops_patch(): + from vllm_ascend.ops.rotary_embedding import ( + AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding) + from vllm_ascend.torchair.ops.torchair_rotary_embedding import ( + deepseek_rope_init_func, native_rope_deepseek_forward, + qwen_rope_init_func, rope_forward) + + 
AscendRotaryEmbedding.__init__ = qwen_rope_init_func # type: ignore[method-assign] + AscendRotaryEmbedding.forward_oot = rope_forward # type: ignore[method-assign] + + AscendDeepseekScalingRotaryEmbedding.__init__ = deepseek_rope_init_func # type: ignore[method-assign] + AscendDeepseekScalingRotaryEmbedding.forward = native_rope_deepseek_forward # type: ignore[method-assign] diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py new file mode 100644 index 0000000..adab490 --- /dev/null +++ b/vllm_ascend/utils.py @@ -0,0 +1,558 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/worker.py +# + +import atexit +import functools +import math +import os +from contextlib import contextmanager +from enum import Enum +from threading import Lock +from typing import TYPE_CHECKING, List, Tuple + +import torch +import torch_npu # noqa: F401 # noqa: F401 +from packaging.version import InvalidVersion, Version +from torch_npu.npu.streams import Event +from vllm.logger import logger + +import vllm_ascend.envs as envs_ascend +from vllm_ascend.ascend_config import get_ascend_config + +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +# NOTE: Currently, we can only capture 1920 graphs at most, +# due to the limitation of ACL graph. 
This number is bounded by +# the number of streams, which is 2048, we save 128 streams +# as a buffer. +# Maximum number of graphs that can be captured by ACL Graph +MAX_CAPTURE_SIZE = 1920 + +ASCEND_QUANTIZATION_METHOD = "ascend" +SOC_VERSION_INFERENCE_SERIES = ["Ascend310P3"] + +ACL_FORMAT_FRACTAL_ND = 2 +ACL_FORMAT_FRACTAL_NZ = 29 + +_CUSTOM_OP_ENABLED = None +_IS_310P = None +_SLEEP_MODE_ENABLED = None +_CURRENT_STREAM = None +_ASCEND_CUSTOMOP_IS_REIGISTERED = False + + +def is_310p(): + global _IS_310P + if _IS_310P is None: + from vllm_ascend import _build_info # type: ignore + _IS_310P = _build_info.__soc_version__.lower().startswith("ascend310p") + return _IS_310P + + +def sleep_mode_enabled(): + global _SLEEP_MODE_ENABLED + if _SLEEP_MODE_ENABLED is None: + from vllm_ascend import _build_info # type: ignore + _SLEEP_MODE_ENABLED = _build_info.__sleep_mode_enabled__ + return _SLEEP_MODE_ENABLED + + +def _round_up(x: int, align: int): + # round up x to align, for example, if align is 16, x will be rounded up to 16, 32, 48, etc. + # input: 15, 16 -> output: 16 + # input: 17, 16 -> output: 32 + # input: 30, 16 -> output: 32 + # input: 33, 16 -> output: 48 + # ... 
+ return (x + align - 1) // align * align + + +def _custom_pad(x, pad_dims): + # pad the input tensor to the shape of pad_dims + # input: (13, 30), pad_dims: [0, 2, 0, 3] + # output: (16, 32) + return torch.nn.functional.pad(x, pad_dims) + + +def _custom_reshape(x, target_shape): + # reshape the input tensor to the shape of target_shape + # input: (16, 32), target_shape: [1, 16, 2, 16] + # output: (1, 16, 2, 16) + return x.reshape(target_shape) + + +def _custom_transpose(x, dim1, dim2): + # transpose the input tensor + # input: (1, 16, 2, 16), dim1: 1, dim2: 2 + # output: (1, 2, 16, 16) + return x.transpose(dim1, dim2) + + +def nd_to_nz_2d(in_tensor: torch.Tensor) -> torch.Tensor: + # in_tensor: (13, 30) + aux_dims = [1, 0, 0, 16] + # aux_dims[1]: 16 + aux_dims[1] = _round_up(in_tensor.size(0), 16) + # aux_dims[2]: 2 + aux_dims[2] = _round_up(in_tensor.size(1), 16) // 16 + + # after: aux_dims: [1, 16, 2, 16] + + pad_dims = [0, 0, 0, 0] + # pad_dims[1]: 2 + pad_dims[1] = _round_up(in_tensor.size(1), 16) - in_tensor.size(1) + # pad_dims[3]: 3 + pad_dims[3] = _round_up(in_tensor.size(0), 16) - in_tensor.size(0) + + # after: pad_dims: [0, 2, 0, 3] + + # return: (1, 2, 16, 16) + return _custom_transpose( + _custom_reshape(_custom_pad(in_tensor, pad_dims), aux_dims), 1, + 2).contiguous() + + +def nd_to_nz_spec(mask_tensor: torch.Tensor) -> torch.Tensor: + num_tokens = mask_tensor.shape[0] + max_seq_len = mask_tensor.shape[1] + + tokens_pad = (num_tokens + 15) // 16 * 16 + max_seq_len_pad = (max_seq_len + 15) // 16 * 16 + + mask_tensor_pad = \ + torch.zeros((1, tokens_pad, max_seq_len_pad), dtype=mask_tensor.dtype, device=mask_tensor.device) + mask_tensor_pad[0][:num_tokens, :max_seq_len] = mask_tensor + mask = mask_tensor_pad.reshape( + (1, tokens_pad, max_seq_len_pad // 16, 16)).permute(0, 2, 1, 3) + return mask + + +def aligned_16(tensor: torch.Tensor): + """Aligned tensor for 310P""" + + # Get the size of the current 0th dimension + n = tensor.size(0) + + # Calculate 
the aligned size + n_aligned = ((n + 15) // 16) * 16 + + # If already aligned, return the original tensor + if n == n_aligned: + return tensor + + # Create a new tensor with shape (n_aligned, H, W) and fill it with zeros + new_tensor = torch.zeros(n_aligned, + *tensor.shape[1:], + dtype=tensor.dtype, + device=tensor.device) + + # Copy the original tensor to the first N positions of the new tensor + new_tensor[:n] = tensor + + return new_tensor + + +def try_register_lib(lib_name: str, lib_info: str = ""): + import importlib + import importlib.util + try: + module_spec = importlib.util.find_spec(lib_name) + if module_spec is not None: + importlib.import_module(lib_name) + if lib_info: + logger.info(lib_info) + except Exception: + pass + + +def enable_custom_op(): + """ + Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component. + Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device(). + """ + global _CUSTOM_OP_ENABLED + if _CUSTOM_OP_ENABLED is not None: + return _CUSTOM_OP_ENABLED + try: + # isort: off + # register custom ops into torch_library here + import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401 + # register the meta implementation for custom kernel if necessary + import vllm_ascend.meta_registration # type: ignore # noqa: F401 + # isort: on + _CUSTOM_OP_ENABLED = True + except ImportError: + _CUSTOM_OP_ENABLED = False + logger.warning( + "Warning: Failed to register custom ops, all custom ops will be disabled" + ) + return _CUSTOM_OP_ENABLED + + +def find_hccl_library() -> str: + """ + We either use the library file specified by the `HCCL_SO_PATH` + environment variable, or we find the library file brought by PyTorch. + After importing `torch`, `libhccl.so` can be + found by `ctypes` automatically. 
+ """ + so_file = envs_ascend.HCCL_SO_PATH + + # manually load the hccl library + if so_file: + logger.info("Found hccl from environment variable HCCL_SO_PATH=%s", + so_file) + else: + if torch.version.cann is not None: + so_file = "libhccl.so" + else: + raise ValueError("HCCL only supports Ascend NPU backends.") + logger.info("Found hccl from library %s", so_file) + return so_file + + +def current_stream() -> torch.npu.Stream: + """ + replace `torch.npu.current_stream()` with `vllm.utils.current_stream()`. + it turns out that `torch.npu.current_stream()` is quite expensive, + as it will construct a new stream object at each call. + here we patch `torch.npu.set_stream` to keep track of the current stream + directly, so that we can avoid calling `torch.npu.current_stream()`. + + """ + global _CURRENT_STREAM + if _CURRENT_STREAM is None: + # when this function is called before any stream is set, + # we return the default stream. + _CURRENT_STREAM = torch.npu.current_stream() + return _CURRENT_STREAM + + +def adapt_patch(is_global_patch: bool = False): + if is_global_patch: + from vllm_ascend.patch import platform # noqa: F401 + else: + from vllm_ascend.patch import worker # noqa: F401 + + +@functools.cache +def vllm_version_is(target_vllm_version: str): + if envs_ascend.VLLM_VERSION is not None: + vllm_version = envs_ascend.VLLM_VERSION + else: + import vllm + vllm_version = vllm.__version__ + try: + return Version(vllm_version) == Version(target_vllm_version) + except InvalidVersion: + raise ValueError( + f"Invalid vllm version {vllm_version} found. A dev version of vllm " + "is installed probably. Set the environment variable VLLM_VERSION " + "to control it by hand. 
And please make sure the value follows the " + "format of x.y.z.") + + +def get_max_hidden_layers(hf_config) -> int: + cfg_dict = hf_config.to_dict() + layer_counts = [] + + def _rec_find(d): + if isinstance(d, dict): + for k, v in d.items(): + if k == "num_hidden_layers" and isinstance(v, int): + layer_counts.append(v) + else: + _rec_find(v) + + _rec_find(cfg_dict) + if not layer_counts: + raise ValueError("Not found num_hidden_layers in model config.") + return max(layer_counts) + + +def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: + """Update ACL graph capture sizes based on hardware limitations""" + # Store original configuration and temporarily clear it + compilation_config = vllm_config.compilation_config + original_sizes, compilation_config.cudagraph_capture_sizes = \ + compilation_config.cudagraph_capture_sizes, None + + # Calculate parallel configuration factor + hf_config = vllm_config.model_config.hf_config + if hasattr(hf_config, 'num_hidden_layers'): + num_hidden_layers = hf_config.num_hidden_layers + else: + num_hidden_layers = get_max_hidden_layers(hf_config) + parallel_config = vllm_config.parallel_config + + # TODO: Find out whether we need to take into account the pp_size + num_comm_groups = sum(size > 1 for size in [ + parallel_config.data_parallel_size, + parallel_config.tensor_parallel_size, + ]) + + if os.getenv("HCCL_OP_EXPANSION_MODE") == 'AIV': + # TODO: Find out whether we need to take into account the pp_size + parallel_factor = 1 + num_comm_groups + int( + parallel_config.enable_expert_parallel) + # Calculate maximum supported batch sizes considering model architecture on the A2 Hardware Device + # Assume the following case: + # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4, + # According to the formula, max_num_batch_sizes = math.floor(1920 / (48 + 1) / 2) = 19 + max_num_batch_sizes = math.floor( + MAX_CAPTURE_SIZE / (num_hidden_layers + 1) / parallel_factor) + 
logger.info( + "Calculated maximum supported batch sizes for ACL graph: %s", + max_num_batch_sizes) + else: + # The above describes an empirical formula applicable to the A2 hardware. + # Under this configuration, HCCL employs the FFTS+ method for execution unfolding, + # which adds only 1 concurrent stream without consuming collective communication execution unfolding streams. + # On A3 hardware, HCCL defaults to the AICPU method. + # This approach may additionally allocate up to rank_size (max 16) - 1 streams per collective communication domain on the device (worst case). + # Using the default collective communication unfolding method on A3 will lead to a significant reduction in the maximum supported sizes. + # Therefore, the calculation formula has been modified as follows: + # Assume the following case: + # MAX_CAPTURE_SIZE = 1920, num_hidden_layers = 48, data_parallel_size is 1, tensor_parallel_size is 4, + # According to the formula, max_num_batch_sizes = math.floor((1920 - 1 * 40) / (48 + 1) / (1 + 1 * 2)) = 12 + max_num_batch_sizes = math.floor( + (MAX_CAPTURE_SIZE - num_comm_groups * 40) / + (num_hidden_layers + 1) / (1 + num_comm_groups * 2)) + logger.info( + "Calculated maximum supported batch sizes for ACL graph: %s", + max_num_batch_sizes) + logger.warning( + "Currently, communication is performed using FFTS+ method, which reduces " + "the number of available streams and, as a result, limits the range of runtime " + "shapes that can be handled. To both improve communication performance and " + "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV." 
+ ) + + # If original sizes exceed maximum, sample a representative subset + if max_num_batch_sizes < len(original_sizes): + # Sample uniformly from original sizes + step = (len(original_sizes) - 1) / (max_num_batch_sizes - 1) + indices = [round(i * step) for i in range(max_num_batch_sizes)] + + # Ensure first and last elements are preserved + indices[0], indices[-1] = 0, len(original_sizes) - 1 + + sampled_sizes = [original_sizes[i] for i in indices] + compilation_config.init_with_cudagraph_sizes(sampled_sizes) + + logger.info( + "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", + vllm_config.model_config.architectures[0], + num_hidden_layers, + len(original_sizes), + len(compilation_config. + cudagraph_capture_sizes # type: ignore[arg-type] + )) + else: + # No adjustment needed + compilation_config.cudagraph_capture_sizes = original_sizes + logger.info( + "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes", + vllm_config.model_config.architectures[0], num_hidden_layers, + len(original_sizes)) + + +# TODO(wxy): Move to ops module +def dispose_tensor(x: torch.Tensor): + x.set_(torch.empty((0, ), device=x.device, dtype=x.dtype)) + + +class ProfileExecuteDuration: + _instance = None + _observations: List[Tuple[str, Event, Event]] = [] + _lock = Lock() + + def __new__(cls): + with cls._lock: + if cls._instance is None: + cls._instance = super().__new__(cls) + atexit.register(cls._instance.destroy) + return cls._instance + + def destroy(self): + with self._lock: + self._observations.clear() + + @contextmanager + def capture_async(self, duration_tag: str): + if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE: + yield + return + + observe_start = Event(enable_timing=True) + observe_start.record() + try: + yield + finally: + observe_end = Event(enable_timing=True) + observe_end.record() + with self._lock: + self._observations.append( + (duration_tag, observe_start, observe_end)) + + def 
pop_captured_sync(self) -> dict: + """Pop and synchronize all events in the observation list""" + durations: dict[str, float] = {} + if not envs_ascend.VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE: + return durations + + while self._observations: + with self._lock: + tag, observe_start, observe_end = self._observations.pop() + observe_end.synchronize() + durations[tag] = observe_start.elapsed_time(observe_end) + + return durations + + +# TODO(wxy): Move to ops module +def npu_prefetch(input: torch.Tensor, + dependency: torch.Tensor, + max_size: int = 0, + *, + enabled: bool = True): + if not enabled: + return + input_size = input.element_size() * input.numel() + if max_size <= 0 or max_size > input_size: + max_size = input_size + torch_npu.npu_prefetch(input, dependency, max_size) + + +# TODO(ttanzhiqiang): rm_router_logits +# dp>1 will trigger +# In theory, this solution is only applicable to AllGather and AllGatherEP, because in the dp scenario, the previous operation was gate + two communications, and now it is changed to one communication + gate operation, which can save some communication time. In theory, all moe AllGather and AllGatherEP solutions can follow this logic, but now other moe models (qwen3-235b) dp solutions are not adjusted, so use the switch to control it to prevent code errors. 
+def get_rm_router_logits_state(ep_size: int, dp_size: int,
+                               is_deepseek_v3_r1: bool):
+    # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
+    # only supports deepseek v3/r1
+    if dp_size > 1:
+        if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
+                and is_deepseek_v3_r1):
+            return True
+        elif ep_size == 1 and is_deepseek_v3_r1:
+            return True
+    return False
+
+
+# TODO(ttanzhiqiang): all_reduce merge
+# When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
+# Currently, all_reduce_merge is enabled by default in the AllGather, AllGatherEP and NaiveMulticast scenarios of the deepseek model.
+def get_all_reduce_merge_state(ep_size: int, is_deepseek_v3_r1: bool):
+    # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
+    # only supports deepseek v3/r1
+    if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
+            and is_deepseek_v3_r1):
+        return True
+    elif ep_size == 1 and is_deepseek_v3_r1:
+        return True
+    return False
+
+
+def register_ascend_customop():
+    """Register Ascend CustomOP
+
+    NOTE: if the register branch requires model type, please use `vllm.config.get_current_vllm_config`,
+    and ensure this will execute after model config is initialized.
+ """ + global _ASCEND_CUSTOMOP_IS_REIGISTERED + if _ASCEND_CUSTOMOP_IS_REIGISTERED: + return + from vllm.model_executor.custom_op import CustomOp + + from vllm_ascend.ops.activation import AscendQuickGELU, AscendSiluAndMul + from vllm_ascend.ops.linear import (AscendMlpColumnParallelLinear, + AscendMlpMergedColumnParallelLinear, + AscendMlpRowParallelLinear) + from vllm_ascend.ops.rotary_embedding import ( + AscendDeepseekScalingRotaryEmbedding, AscendRotaryEmbedding) + from vllm_ascend.ops.vocab_parallel_embedding import ( + AscendLogitsProcessor, AscendParallelLMHead, + AscendVocabParallelEmbedding) + CustomOp.register_oot(_decorated_op_cls=AscendQuickGELU, name="QuickGELU") + CustomOp.register_oot(_decorated_op_cls=AscendSiluAndMul, + name="SiluAndMul") + CustomOp.register_oot(_decorated_op_cls=AscendRotaryEmbedding, + name="RotaryEmbedding") + CustomOp.register_oot( + _decorated_op_cls=AscendDeepseekScalingRotaryEmbedding, + name="DeepseekScalingRotaryEmbedding") + CustomOp.register_oot(_decorated_op_cls=AscendVocabParallelEmbedding, + name="VocabParallelEmbedding") + CustomOp.register_oot(_decorated_op_cls=AscendParallelLMHead, + name="ParallelLMHead") + CustomOp.register_oot(_decorated_op_cls=AscendLogitsProcessor, + name="LogitsProcessor") + if envs_ascend.VLLM_ASCEND_ENABLE_MLP_OPTIMIZE: + CustomOp.register_oot(_decorated_op_cls=AscendMlpColumnParallelLinear, + name="ColumnParallelLinear") + CustomOp.register_oot(_decorated_op_cls=AscendMlpRowParallelLinear, + name="RowParallelLinear") + CustomOp.register_oot( + _decorated_op_cls=AscendMlpMergedColumnParallelLinear, + name="MergedColumnParallelLinear") + + from vllm_ascend.ops.layernorm import AscendRMSNorm + CustomOp.register_oot(_decorated_op_cls=AscendRMSNorm, name="RMSNorm") + + from vllm_ascend.ops.common_fused_moe import AscendFusedMoE + CustomOp.register_oot(_decorated_op_cls=AscendFusedMoE, name="FusedMoE") + + # NOTE: Keep this at last to ensure all custom actions are registered + 
_ASCEND_CUSTOMOP_IS_REIGISTERED = True + + +# TODO(zzzzwwjj): Currently there is no clear SOC_VERSION policy for A2 and A3 in CANN. +# So we get the version dynamically. In the future, we should get the version info from _build_info like 310p does. +class AscendSocVersion(Enum): + A2 = 0 + A3 = 1 + UNDEFINED = 2 + + +_ascend_soc_version = None + + +def init_ascend_soc_version(): + soc_version = torch_npu.npu.get_soc_version() + global _ascend_soc_version + if 220 <= soc_version <= 225: + _ascend_soc_version = AscendSocVersion.A2 + elif 250 <= soc_version <= 255: + _ascend_soc_version = AscendSocVersion.A3 + else: + _ascend_soc_version = AscendSocVersion.UNDEFINED + + +def get_ascend_soc_version(): + global _ascend_soc_version + assert _ascend_soc_version is not None + return _ascend_soc_version + + +def lmhead_tp_enable() -> bool: + return get_ascend_config().lmhead_tensor_parallel_size is not None diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/vllm_ascend/worker/eagle_proposer_v1.py b/vllm_ascend/worker/eagle_proposer_v1.py new file mode 100644 index 0000000..479ef1d --- /dev/null +++ b/vllm_ascend/worker/eagle_proposer_v1.py @@ -0,0 +1,398 @@ +# SPDX-License-Identifier: Apache-2.0 +import os + +import torch +import torch.nn as nn +from vllm.attention.layer import Attention +from vllm.config import (CompilationLevel, VllmConfig, + get_layers_from_vllm_config) +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import logger +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal +from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM +from vllm.v1.sample.metadata import SamplingMetadata + +from vllm_ascend.ascend_forward_context import set_ascend_forward_context +from vllm_ascend.attention.attention_mask import AttentionMaskBuilder +from vllm_ascend.attention.attention_v1 
import AscendAttentionState +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata + +PADDING_SLOT_ID = -1 + + +class EagleProposer: + + def __init__(self, + vllm_config: VllmConfig, + device: torch.device, + runner=None): + self.vllm_config = vllm_config + self.speculative_config = vllm_config.speculative_config + self.draft_model_config = self.speculative_config.draft_model_config + self.method = self.speculative_config.method + self.runner = runner + self.model_config = vllm_config.model_config + self.dtype = vllm_config.model_config.dtype + self.max_model_len = vllm_config.model_config.max_model_len + self.block_size = vllm_config.cache_config.block_size + self.num_speculative_tokens = ( + self.speculative_config.num_speculative_tokens) + self.max_num_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens) + self.device = device + # We need to get the hidden size from the draft model config because + # the draft model's hidden size can be different from the target model's + # hidden size (e.g., Llama 3.3 70B). + self.hidden_size = self.draft_model_config.get_hidden_size() + + self.use_cuda_graph = (self.vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE and + not self.vllm_config.model_config.enforce_eager) + self.cudagraph_batch_sizes = list( + reversed( + self.vllm_config.compilation_config.cudagraph_capture_sizes)) + + # persistent buffers for cuda graph + self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=device) + self.hidden_states = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=device) + # We need +1 here because the arange is used to set query_start_loc, + # which has one more element than batch_size. 
+ self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs + + 1, + device=device, + dtype=torch.int32) + mask_len = os.getenv("PAGED_ATTENTION_MASK_LEN", 10000) + self.attn_mask_len = min(self.model_config.max_model_len, + int(mask_len)) + self.attn_mask_builder = AttentionMaskBuilder(self.attn_mask_len, + self.dtype) + + def _make_attention_mask( + self, + seq_lens, + position, + ) -> torch.Tensor: + return self.attn_mask_builder.get_splitfuse_attn_mask( + seq_lens, position, self.dtype, self.device) + + def propose( + self, + # [num_tokens] + target_token_ids: torch.Tensor, + # [num_tokens] + target_positions: torch.Tensor, + # [num_tokens, hidden_size] + target_hidden_states: torch.Tensor, + # [num_tokens] + target_slot_mapping: torch.Tensor, + # [batch_size] + next_token_ids: torch.Tensor, + # [batch_size + 1] starting with 0 + cu_num_tokens: torch.Tensor, + # [batch_size, max_num_blocks_per_req] + block_table: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + device = cu_num_tokens.device + cu_num_tokens = cu_num_tokens.cpu() + block_table = block_table.cpu() + num_tokens = target_token_ids.shape[0] + batch_size = next_token_ids.shape[0] + last_token_indices = cu_num_tokens[1:] - 1 + target_positions = target_positions.cpu() + if self.method == "eagle3": + assert isinstance(self.model, Eagle3LlamaForCausalLM) + target_hidden_states = self.model.combine_hidden_states( + target_hidden_states) + assert target_hidden_states.shape[-1] == self.hidden_size + + # Shift the input ids by one token. + # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] + self.input_ids[:num_tokens - 1] = target_token_ids[1:] + # Replace the last token with the next token. 
+ # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4] + self.input_ids[last_token_indices] = next_token_ids[0] + + query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1] + max_query_len = query_lens.max().item() + + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=self.runner.query_start_loc[:batch_size + 1], + query_start_loc_cpu=self.runner.query_start_loc_cpu[:batch_size + + 1], + seq_lens_cpu=self.runner.seq_lens_cpu, + max_query_len=max_query_len, + num_reqs=batch_size, + num_actual_tokens=num_tokens, + actual_seq_lengths_q=self.runner.actual_seq_lengths_q, + block_table_tensor=self.runner.input_batch.block_table[0]. + get_device_tensor(), + slot_mapping_cpu=target_slot_mapping, + positions=target_positions, + attn_mask=self.runner.attn_mask, + spec_attn_mask=self.runner.spec_attn_mask, + attn_state=self.runner.attn_state, + decode_token_per_req=self.runner.decode_token_per_req, + ) + # FIXME(woosuk): The below two ops cause synchronization. Optimize. + attn_metadata = self.runner.attn_metadata_builder.build( + common_attn_metadata, self.runner.model) + if self.use_cuda_graph and \ + num_tokens <= self.cudagraph_batch_sizes[-1]: + num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) + else: + num_input_tokens = num_tokens + # copy inputs to buffer for cudagraph + self.positions[:num_tokens] = target_positions.to(device) + self.hidden_states[:num_tokens] = target_hidden_states + attn_metadata.block_tables = block_table.to(device) + with set_ascend_forward_context(attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens): + last_hidden_states, hidden_states = self.model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + hidden_states=self.hidden_states[:num_input_tokens], + ) + sample_hidden_states = last_hidden_states[last_token_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + draft_token_ids = logits.argmax(dim=-1) + + # Early exit if 
there is only one draft token to be generated. + if self.num_speculative_tokens == 1: + # [batch_size, 1] + return draft_token_ids.view(-1, 1) + + # Generate the remaining draft tokens. + draft_token_ids_tensor = torch.zeros( + (self.num_speculative_tokens, *draft_token_ids.shape), + dtype=draft_token_ids.dtype) + draft_token_ids_tensor[0] = draft_token_ids + + positions_cpu = target_positions[last_token_indices].cpu().to( + torch.int64) + hidden_states = hidden_states[last_token_indices] + if self.use_cuda_graph and \ + batch_size <= self.cudagraph_batch_sizes[-1]: + input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) + else: + input_batch_size = batch_size + attn_metadata.num_actual_tokens = batch_size + attn_metadata.max_query_len = 1 + attn_metadata.query_start_loc = self.arange[:batch_size + 1] + + if self.num_speculative_tokens > 2: + raise ValueError("Speculative tokens > 2 are not supported yet.") + + attn_metadata.attn_state = AscendAttentionState.ChunkedPrefill + for now_speculative in range(self.num_speculative_tokens - 1): + # Update the inputs. + # cast to int32 is crucial when eagle model is compiled. + # tensor.argmax() returns int64 by default. + input_ids = draft_token_ids_tensor[now_speculative].to(device) + positions_cpu += 1 + + # NOTE(woosuk): We should handle the case where the draft model + # generates tokens beyond the max model length. Since it is complex + # to remove such requests from the batch, we keep them in the batch + # but adjust the position ids and slot mappings to avoid the + # out-of-range access during the model execution. The draft tokens + # generated with this adjustment should be ignored. + exceeds_max_model_len = positions_cpu >= self.max_model_len + # Mask out the position ids that exceed the max model length. + # Otherwise, we may get out-of-range error in RoPE. 
+ clamped_positions_cpu = torch.where(exceeds_max_model_len, 0, + positions_cpu) + clamped_positions = clamped_positions_cpu.to(device) + + # TODO: Increment the sequence lengths. + + attn_metadata.seq_lens += 1 + # TODO: Consider max model length. + # attn_metadata.max_seq_len = min(attn_metadata.max_seq_len, + # self.max_model_len) + # For the requests that exceed the max model length, we set the + # TODO: sequence length to 1 to minimize their overheads in attention. + + # Compute the slot mapping. + block_numbers = (clamped_positions_cpu // self.block_size) + block_ids = block_table.gather(dim=1, + index=block_numbers.view(-1, 1)) + block_ids = block_ids.view(-1) + slot_mapping_cpu = (block_ids * self.block_size + + clamped_positions_cpu % self.block_size) + + # Mask out the slot mappings that exceed the max model length. + # Otherwise, the KV cache will be inadvertently updated with the + # padding tokens. + slot_mapping_cpu.masked_fill_(exceeds_max_model_len, + PADDING_SLOT_ID) + # NOTE: ASCEND slot_mapping must on cpu + attn_metadata.slot_mapping = slot_mapping_cpu.to( + torch.int32).to(device) + # copy inputs to buffer for cudagraph + self.input_ids[:batch_size] = input_ids + self.positions[:batch_size] = clamped_positions + self.hidden_states[:batch_size] = hidden_states + positions = positions_cpu.to(device) + attn_mask = self._make_attention_mask( + seq_lens=attn_metadata.seq_lens, + position=positions, + ) + attn_metadata.attn_mask = attn_mask + attn_metadata.block_tables = block_table.to(device) + # Run the model. 
+ with set_ascend_forward_context(attn_metadata, + self.vllm_config, + num_tokens=input_batch_size): + + last_hidden_states, hidden_states = self.model( + input_ids=self.input_ids[:input_batch_size], + positions=self.positions[:input_batch_size], + hidden_states=self.hidden_states[:input_batch_size], + ) + hidden_states = hidden_states[:batch_size] + logits = self.model.compute_logits(last_hidden_states[:batch_size], + None) + + # TODO(wenlong): get more than one token for tree attention + draft_token_ids = logits.argmax(dim=-1) + draft_token_ids_tensor[now_speculative + 1] = draft_token_ids.cpu() + + # [batch_size, num_speculative_tokens] + draft_token_ids = draft_token_ids_tensor.swapaxes(0, 1) + return draft_token_ids + + @staticmethod + def prepare_inputs( + # [batch_size + 1] + cu_target_query_lens: torch.Tensor, + # [batch_size] + num_rejected_tokens: torch.Tensor, + num_tokens: int, + ) -> tuple[torch.Tensor, torch.Tensor]: + # cu_target_query_lens: [0, a, a + b, a + b + c] + # num_rejected_tokens: [n1, n2, n3] + # num_tokens_per_req: [a - n1, b - n2, c - n3] + # cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] + # token_indices: [0, 1, ..., a - n1 - 1, + # a, a + 1, ..., a + b - n2 - 1, + # a + b, a + b + 1, ..., a + b + c - n3 - 1] + + # [0, a, a + b, a + b + c] -> [a, b, c] + query_len_per_req = (cu_target_query_lens[1:] - + cu_target_query_lens[:-1]) + # [a, b, c] -> [a - n1, b - n2, c - n3] + num_tokens_per_req = query_len_per_req - num_rejected_tokens + + # [a - n1, b - n2, c - n3] -> + # [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] + cu_num_tokens = torch.zeros_like(cu_target_query_lens) + torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) + token_indices = torch.empty( + num_tokens, + dtype=torch.int32, + device=cu_target_query_lens.device, + ) + BLOCK_SIZE = 1024 + prepare_eagle_input_sequential( + token_indices, + cu_target_query_lens, + cu_num_tokens, + block_size=BLOCK_SIZE, + ) + return cu_num_tokens, 
token_indices + + def load_model(self, target_model: nn.Module) -> None: + draft_model_config = \ + self.vllm_config.speculative_config.draft_model_config + target_attn_layer_names = set( + get_layers_from_vllm_config(self.vllm_config, Attention).keys()) + + self.model = get_model(vllm_config=self.vllm_config, + model_config=draft_model_config) + + draft_attn_layer_names = ( + get_layers_from_vllm_config(self.vllm_config, Attention).keys() - + target_attn_layer_names) + + self.attn_layer_names = list(draft_attn_layer_names) + self.attn_layer_name = next(iter(draft_attn_layer_names)) + # share embed_tokens with the target model if needed + if get_pp_group().world_size == 1: + logger.info( + "The EAGLE head shares the same vocab embedding" \ + " with the target model." + ) + self.model.model.embed_tokens = target_model.model.embed_tokens + else: + logger.info( + "Since PP > 1, the EAGLE head loaded its own vocab embedding" \ + " weights instead of sharing them with the target model." + ) + + # share lm_head with the target model if needed + # some model definition do not define lm_head explicitly + # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM + if self.vllm_config.speculative_config.method != "eagle3" and \ + hasattr(target_model, "lm_head"): + logger.info("Loading EAGLE LM head weights from the target model.") + if supports_multimodal(target_model): + self.model.lm_head = target_model.get_language_model().lm_head + else: + self.model.lm_head = target_model.lm_head + + @torch.inference_mode() + def dummy_run( + self, + num_tokens: int, + ) -> None: + with set_ascend_forward_context(None, + self.vllm_config, + num_tokens=num_tokens): + self.model( + input_ids=self.input_ids[:num_tokens], + positions=self.positions[:num_tokens], + hidden_states=self.hidden_states[:num_tokens], + ) + + +def prepare_eagle_input_sequential(out_tensor: torch.Tensor, + cu_query_lens: torch.Tensor, + cu_num_tokens: torch.Tensor, + block_size: int): + num_programs = 
len(cu_num_tokens) - 1 + for pid in range(num_programs): + start_pos = cu_num_tokens[pid].item() + end_pos = cu_num_tokens[pid + 1].item() + num_tokens = end_pos - start_pos + index_start = cu_query_lens[pid].item() + num_blocks = int( + torch.ceil(torch.tensor(num_tokens / block_size)).item()) + + for i in range(num_blocks): + offset_tensor = torch.arange(0, + block_size, + dtype=torch.int32, + device=out_tensor.device) + global_start_offset = i * block_size + target_indices = torch.tensor( + start_pos + global_start_offset, + dtype=torch.int32, + device=out_tensor.device) + offset_tensor + values_to_store = torch.tensor( + index_start, dtype=torch.int32, + device=out_tensor.device) + offset_tensor + mask = (target_indices >= start_pos) & \ + (target_indices < end_pos) & \ + (offset_tensor < num_tokens) + out_tensor[target_indices[mask]] = values_to_store[mask] diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py new file mode 100644 index 0000000..7a9fe1b --- /dev/null +++ b/vllm_ascend/worker/model_runner_v1.py @@ -0,0 +1,2883 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# Adapted from vllm-project/vllm/vllm/worker/gpu_model_runner.py +# + +import copy +import gc +import math +import time +from contextlib import contextmanager, nullcontext +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, Union, cast + +import numpy as np +import numpy.typing as npt +import torch +import torch._dynamo.cache_size +import torch.distributed as dist +import torch.nn as nn +from tqdm import tqdm # type: ignore +from vllm.attention import AttentionType, get_attn_backend +from vllm.attention.layer import Attention +from vllm.compilation.counter import compilation_counter +from vllm.compilation.monitor import set_cudagraph_capturing_enabled +from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.distributed.kv_transfer import (get_kv_transfer_group, + has_kv_transfer_group) +from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 +from vllm.distributed.parallel_state import (get_dp_group, get_pp_group, + get_tp_group, + is_global_first_rank) +from vllm.forward_context import BatchDescriptor, get_forward_context +from vllm.logger import logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models.interfaces import supports_transcription +from vllm.model_executor.models.interfaces_base import ( + VllmModelForPooling, is_pooling_model, is_text_generation_model) +from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange +from vllm.multimodal.utils import group_mm_kwargs_by_modality +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import SamplingType +from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.tasks import GenerationTask, PoolingTask, SupportedTask +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, + LazyLoader, cdiv, 
is_pin_memory_available) +from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheSpec) +from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, + ModelRunnerOutput) +from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import build_logitsprocs +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata +from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorOutput +from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin +from vllm.v1.worker.utils import (bind_kv_cache, gather_mm_placeholders, + sanity_check_mm_encoder_outputs, + scatter_mm_placeholders) + +from vllm_ascend.ascend_config import get_ascend_config +from vllm_ascend.ascend_forward_context import set_ascend_forward_context +from vllm_ascend.attention.attention_mask import AttentionMaskBuilder +from vllm_ascend.attention.attention_v1 import (AscendAttentionState, + AscendMetadata) +from vllm_ascend.attention.mla_v1 import AscendMLAMetadata +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.compilation.acl_graph import ACLGraphWrapper +from vllm_ascend.multistream.ms_split import compute_split_seq_index +from vllm_ascend.platform import NPUPlatform +from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler +from vllm_ascend.torchair.torchair_attention import AscendTorchairMetadata +from vllm_ascend.torchair.torchair_mla import AscendMLATorchairMetadata +from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ, + AscendSocVersion, ProfileExecuteDuration, + get_ascend_soc_version, is_310p, + lmhead_tp_enable, vllm_version_is) +from vllm_ascend.worker.eagle_proposer_v1 import EagleProposer +from vllm_ascend.worker.mtp_proposer_v1 import MtpProposer +from 
vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch + +if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + +if TYPE_CHECKING: + import xgrammar as xgr # type: ignore[import-untyped] + from vllm.v1.core.sched.output import SchedulerOutput +else: + xgr = LazyLoader("xgr", globals(), "xgrammar") + +import torch_npu + +import vllm_ascend.envs as envs_ascend + +# if true, allow tensor initialization and casting with internal format (e.g., NZ) +torch.npu.config.allow_internal_format = True + +if is_310p(): + torch_npu.npu.set_compile_mode(jit_compile=False) + ACL_FORMAT = ACL_FORMAT_FRACTAL_NZ +else: + ACL_FORMAT = ACL_FORMAT_FRACTAL_ND + + +@dataclass +class GraphCaptureContext: + stream: torch.npu.Stream + + +@contextmanager +def graph_capture(device: torch.device): + """ + `graph_capture` is a context manager which should surround the code that + is capturing the NPU graph. Its main purpose is to ensure that the + some operations will be run after the graph is captured, before the graph + is replayed. It returns a `GraphCaptureContext` object which contains the + necessary data for the graph capture. Currently, it only contains the + stream that the graph capture is running on. This stream is set to the + current NPU stream when the context manager is entered and reset to the + default stream when the context manager is exited. This is to ensure that + the graph capture is running on a separate stream from the default stream, + in order to explicitly distinguish the kernels to capture + from other kernels possibly launched on background in the default stream. 
+ """ + graph_capture_context = GraphCaptureContext( + torch.npu.Stream(device=device)) + stream = graph_capture_context.stream + + # we use nullcontext now + maybe_ca_context = nullcontext() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.npu.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + + with torch.npu.stream(stream), maybe_ca_context: + yield graph_capture_context + + +class NPUModelRunner(LoRAModelRunnerMixin): + + def __init__(self, vllm_config: VllmConfig, device: torch.device): + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.compilation_config = vllm_config.compilation_config + self.load_config = vllm_config.load_config + self.lora_config = vllm_config.lora_config + self.parallel_config = vllm_config.parallel_config + self.pin_memory = is_pin_memory_available() + self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config + self.block_size = vllm_config.cache_config.block_size + self.max_num_blocks_per_req = cdiv(self.model_config.max_model_len, + self.block_size) + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.max_num_reqs = self.scheduler_config.max_num_seqs + self.dp_size = vllm_config.parallel_config.data_parallel_size + self.dp_rank = vllm_config.parallel_config.data_parallel_rank + self.device = device + self.dtype = self.model_config.dtype + if envs_ascend.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION: + # TODO: drop the env config to use ascend sampler by default + from vllm_ascend.sample.sampler import AscendSampler + + self.sampler = AscendSampler() + else: + from vllm.v1.sample.sampler import Sampler + + self.sampler = Sampler() + + # Lazy initialization, these will be set after __init__ + self.kv_caches: List[torch.Tensor] = [] + # TODO: remove Dict[str, Dict[int, 
torch.Tensor]] type after 0.10.1.1 + self.encoder_cache: Union[Dict[str, Dict[int, torch.Tensor]], + Dict[str, torch.Tensor]] = {} + self.attn_mask = None + self.attn_state = None + self.requests: Dict[str, CachedRequestState] = {} + self.intermediate_tensors: Optional[IntermediateTensors] = None + + ascend_config = get_ascend_config() + if ascend_config.ascend_scheduler_config.enabled: + self.chunked_prefill_enabled = self.scheduler_config.chunked_prefill_enabled + else: + self.chunked_prefill_enabled = True + + if self.cache_config.cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + self.cache_config.cache_dtype] + + self.is_multimodal_model = self.model_config.is_multimodal_model + self.is_pooling_model = self.model_config.pooler_config is not None + if self.is_multimodal_model: + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.model_config.get_hidden_size()), + dtype=self.dtype, + device=self.device) + + # Set up Attention + self.attn_backend = get_attn_backend( + 0, + self.dtype, + None, + self.block_size, + self.model_config.is_attention_free, + use_mla=self.model_config.use_mla, + ) + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + vllm_config, device) + self.attn_mask_builder = AttentionMaskBuilder( + self.model_config.max_model_len, self.dtype) + + # Set up speculative decoding. 
+ self.use_aux_hidden_state_outputs = False + self.use_spec_decode = False + self.spec_attn_mask = None + self.use_eagle = False + self.drafter: Optional[Union[NgramProposer, EagleProposer, + MtpProposer]] = None + self.actual_seq_lengths_q = [] + self.decode_token_per_req = 1 + if self.speculative_config: + self.use_spec_decode = True + spec_token_num = self.speculative_config.num_speculative_tokens + assert spec_token_num > 0 + self.decode_token_per_req = 1 + spec_token_num + self.actual_seq_lengths_q = [ + len for len in + range(self.decode_token_per_req, self.max_num_tokens + + 1, self.decode_token_per_req) + ] + self.spec_attn_mask = torch.triu(torch.ones(2048, + 2048, + dtype=torch.bool), + diagonal=1).to(self.device) + if get_pp_group().is_last_rank: + if self.speculative_config.method == "ngram": + self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.method in ["eagle", "eagle3"]: + self.use_eagle = True + self.drafter = EagleProposer(self.vllm_config, self.device, + self) # type: ignore + if self.speculative_config.method == "eagle3": + self.use_aux_hidden_state_outputs = True + elif self.speculative_config.method == 'deepseek_mtp': + self.drafter = MtpProposer(self.vllm_config, self) + else: + raise ValueError("Unknown speculative decoding method: " + f"{self.speculative_config.method}") + self.rejection_sampler = AscendRejectionSampler() + + # Persistent batch. 
+ self.input_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device=self.device) + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + self.query_start_loc = torch.zeros(self.max_num_reqs + 1, + dtype=torch.int32, + device=self.device) + self.seq_lens = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device=self.device) + + self.uses_mrope = self.model_config.uses_mrope + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros( + (3, self.max_num_tokens + 1), + dtype=torch.int64, + device="cpu", + pin_memory=True) + self.mrope_positions_np = self.mrope_positions_cpu.numpy() + + # OPTIMIZATION: Cache the tensors rather than creating them every step. + self.arange_np: npt.NDArray[np.int32] = np.arange(max( + self.max_num_reqs + 1, self.model_config.max_model_len, + self.max_num_tokens), + dtype=np.int32) + # NOTE(woosuk): These tensors are "stateless", i.e., they are literally + # a faster version of creating a new tensor every time. Thus, we should + # not make any assumptions about the values in these tensors. 
+ self.input_ids_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device="cpu", + pin_memory=True) + self.positions_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device="cpu", + pin_memory=True) + self.positions_np = self.positions_cpu.numpy() + + self.slot_mapping_cpu = torch.zeros(self.max_num_tokens, + dtype=torch.int32, + device="cpu", + pin_memory=True) + self.slot_mapping_np = self.slot_mapping_cpu.numpy() + self.query_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + pin_memory=True) + self.query_start_loc_np = self.query_start_loc_cpu.numpy() + self.seq_lens_cpu = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=True) + self.seq_lens_np = self.seq_lens_cpu.numpy() + + self.use_aclgraph = self._use_aclgraph() + self.aclgraph_batch_sizes = list( + reversed(self.compilation_config.cudagraph_capture_sizes)) + + self.uniform_decode_query_len = 1 if not self.speculative_config else \ + 1 + self.speculative_config.num_speculative_tokens + # aclgraph dispatcher for runtime aclgraph dispatching. + self.aclgraph_dispatcher = CudagraphDispatcher(self.vllm_config) + # Cached outputs. 
+ self._draft_token_ids: Optional[Union[list[list[int]], + torch.Tensor]] = None + + # NOTE: we need to use `in_profile_run` to determine whether `enable_force_load_balance` is True + self.in_profile_run = False + + # kv role + self.is_kv_producer = False + self.is_kv_consumer = False + if vllm_config.kv_transfer_config is not None: + self.is_kv_producer = vllm_config.kv_transfer_config.is_kv_producer + self.is_kv_consumer = vllm_config.kv_transfer_config.is_kv_consumer + + self.mc2_tokens_capacity = 512 * self.parallel_config.tensor_parallel_size + self.reserved_mc2_mask = torch.zeros( + self.mc2_tokens_capacity, + dtype=torch.bool, + device=self.device, + ) + + def _use_aclgraph(self) -> bool: + return self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE and self.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager + + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + # Remove finished requests from the cached states. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + for req_id in scheduler_output.finished_req_ids: + self.input_batch.remove_request(req_id) + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + # Free the cached encoder outputs. 
+ for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) + else: + for mm_hash in scheduler_output.free_encoder_mm_hashes: + self.encoder_cache.pop(mm_hash, None) + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: + self.input_batch.remove_request(req_id) + + req_ids_to_add: list[str] = [] + # Add new requests to the cached states. 
+ for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params + pooling_params = new_req_data.pooling_params + + if sampling_params and \ + sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + if pooling_params: + assert (task := pooling_params.task) is not None, ( + "You did not set `task` in the API") + model = cast(VllmModelForPooling, self.get_model()) + to_update = model.pooler.get_pooling_updates(task) + to_update.apply(pooling_params) + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + mm_kwargs=new_req_data.mm_kwargs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + pooling_params=pooling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + **({ + "mm_hashes": new_req_data.mm_hashes + } if not (vllm_version_is("0.10.1.1") + or vllm_version_is("0.10.1")) else { + "mm_hashes": None + }), + ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + audio_feature_lengths = [] + use_audio_in_video = False + for mm_item in self.requests[req_id].mm_kwargs: + mm_input = mm_item.get_data() + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.append( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.append( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.append( + mm_input["second_per_grid_ts"]) + if mm_input.get("audio_feature_lengths") is not None: + audio_feature_lengths.append( + 
mm_input["audio_feature_lengths"]) + if mm_input.get("use_audio_in_video") is True: + use_audio_in_video = True + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + audio_feature_lengths=audio_feature_lengths, + use_audio_in_video=use_audio_in_video, + ) + + req_ids_to_add.append(req_id) + + # Update the states of the running/resumed requests. + is_last_rank = get_pp_group().is_last_rank + req_data = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(req_data.req_ids): + req_state = self.requests[req_id] + num_computed_tokens = req_data.num_computed_tokens[i] + new_block_ids = req_data.new_block_ids[i] + resumed_from_preemption = req_data.resumed_from_preemption[i] + + # Update the cached states. + req_state.num_computed_tokens = num_computed_tokens + + if not is_last_rank: + # When using PP, the scheduler sends the sampled tokens back, + # because there's no direct communication between the first- + # stage worker and the last-stage worker. + new_token_ids = req_data.new_token_ids[i] + # Add the sampled token(s) from the previous step (if any). + # This doesn't include "unverified" tokens like spec tokens. + num_new_tokens = (num_computed_tokens + len(new_token_ids) - + req_state.num_tokens) + if num_new_tokens == 1: + # Avoid slicing list in most common case. + req_state.output_token_ids.append(new_token_ids[-1]) + elif num_new_tokens > 0: + req_state.output_token_ids.extend( + new_token_ids[-num_new_tokens:]) + + # Update the block IDs. + if not resumed_from_preemption: + if new_block_ids is not None: + # Append the new blocks to the existing block IDs. 
+ for block_ids, new_ids in zip(req_state.block_ids, + new_block_ids): + block_ids.extend(new_ids) + else: + assert new_block_ids is not None + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. + req_state.block_ids = new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + req_ids_to_add.append(req_id) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + num_computed_tokens) + if new_block_ids is not None: + self.input_batch.block_table.append_row( + new_block_ids, req_index) + + # For the last rank, we don't need to update the token_ids_cpu + # because the sampled tokens are already cached. + if not is_last_rank: + # Add new_token_ids to token_ids_cpu. + start_token_index = num_computed_tokens + end_token_index = num_computed_tokens + len(new_token_ids) + self.input_batch.token_ids_cpu[ + req_index, + start_token_index:end_token_index] = new_token_ids + self.input_batch.num_tokens_no_spec[ + req_index] = end_token_index + self.input_batch.num_tokens[req_index] = end_token_index + + # Add spec_token_ids to token_ids_cpu. + spec_token_ids = ( + scheduler_output.scheduled_spec_decode_tokens.get(req_id, ())) + if spec_token_ids: + num_spec_tokens = len(spec_token_ids) + start_index = self.input_batch.num_tokens_no_spec[req_index] + end_token_index = start_index + num_spec_tokens + self.input_batch.token_ids_cpu[ + req_index, start_index:end_token_index] = spec_token_ids + # NOTE(woosuk): `num_tokens` here may include spec tokens. + self.input_batch.num_tokens[req_index] += num_spec_tokens + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. 
+ for req_id in req_ids_to_add: + req_state = self.requests[req_id] + self.input_batch.add_request(req_state) + + # Condense the batched states if there are gaps left by removed requests + self.input_batch.condense() + + # Refresh batch metadata with any pending updates. + self.input_batch.refresh_metadata() + + def _sync_metadata_across_dp( + self, num_tokens: int, with_prefill: bool, enable_dbo: bool + ) -> tuple[int, Optional[torch.Tensor], bool, bool]: + if self.dp_size == 1 or self.vllm_config.model_config.enforce_eager: + return num_tokens, None, with_prefill, enable_dbo + + # Sync num_tokens, with_prefill, enable_dbo across dp ranks + num_tokens_tensor = torch.tensor([ + num_tokens if i == self.dp_rank else 0 for i in range(self.dp_size) + ], + dtype=torch.int32, + device="npu") + + flags_tensor = torch.tensor( + [int(with_prefill), int(not enable_dbo)], + dtype=torch.int32, + device="npu") + + packed_tensor = torch.cat([num_tokens_tensor, flags_tensor]) + + dist.all_reduce(packed_tensor, group=get_dp_group().device_group) + + # Unpack the results + num_tokens_across_dp = packed_tensor[:-2] + synced_flags = packed_tensor[-2:] + + max_tokens_across_dp = torch.max(num_tokens_across_dp).item() + global_with_prefill = bool(synced_flags[0]) + global_enable_dbo = not bool(synced_flags[1]) + + # Create a tensor for num_tokens_after_padding + num_tokens_after_padding = torch.tensor([max_tokens_across_dp] * + self.dp_size, + device="npu", + dtype=torch.int32) + + return max_tokens_across_dp, num_tokens_after_padding, global_with_prefill, global_enable_dbo + + def _check_dbo_is_valid(self, query_lens: torch.Tensor, + attn_state: AscendAttentionState, + num_tokens: int) -> bool: + # do the checks for dp + dbo + if attn_state in [ + AscendAttentionState.DecodeOnly, + AscendAttentionState.SpecDecoding + ]: + return False + # considering the case that one dp rank may enable dbo while others may not + if not self.vllm_config.model_config.use_mla or not 
envs_ascend.VLLM_ASCEND_ENABLE_DBO: + return False + # TODO: remove it if token-level microbatch is enabled + [token_index, + seq_index] = compute_split_seq_index(query_lens, attn_state, + num_tokens) + if token_index == 0 or seq_index == 0 or seq_index == len( + query_lens) or num_tokens < 256: + return False + return True + + def get_eagle_atten_dict( + self, + scheduler_output: "SchedulerOutput", + ) -> dict[str, Union[AscendMetadata, AscendMLAMetadata, + AscendTorchairMetadata, AscendMLATorchairMetadata]]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table.commit_block_table(num_reqs) + + # Get the number of scheduled tokens for each request. + req_ids = self.input_batch.req_ids + tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] + num_scheduled_tokens = np.array(tokens, dtype=np.int32) + max_num_scheduled_tokens = max(tokens) + self.query_lens = torch.from_numpy(num_scheduled_tokens) + # Get request indices. + # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + req_indices = np.repeat(self.arange_np[:num_reqs], + num_scheduled_tokens) + + # cu_num_tokens: [2, 5, 3] -> [2, 7, 10] + # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + cu_num_tokens, arange = self._get_cumsum_and_arange( + num_scheduled_tokens) + + # Get positions. + positions_np = self.positions_np[:total_num_scheduled_tokens] + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + + # Get token indices. 
+ # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) + + # NOTE(woosuk): We use torch.index_select instead of np.take here + # because torch.index_select is much faster than np.take for large + # tensors. + torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(), + 0, + torch.from_numpy(token_indices), + out=self.input_ids_cpu[:total_num_scheduled_tokens]) + + # Prepare the attention metadata for each KV cache group and make layers + # in the same group share the same metadata. + # NOTE(Chen): there is exactly one KV cache group that contains all + # attetnion layers in the model for now, so the current logic for + # getting attn_metadata is not related to kv_cache_group information. + # Will extend this part to support multiple KV cache groups later. + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + block_size = kv_cache_group_spec.kv_cache_spec.block_size + block_table = self.input_batch.block_table[kv_cache_group_id] + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` + # here because M (max_model_len) is not necessarily divisible by + # block_size. + block_table_indices = ( + req_indices * block_table.max_num_blocks_per_req + + positions_np // block_size) + block_table_cpu = block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten( + )[block_table_indices].numpy() + block_offsets = positions_np % block_size + np.add( + block_numbers * block_size, + block_offsets, + out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) + + # Prepare the attention metadata. 
+ self.query_start_loc_np[0] = 0 + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens + + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + + # Copy the tensors to the NPU. + self.input_ids[:total_num_scheduled_tokens].copy_( + self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) + + self.query_start_loc[:num_reqs + 1].copy_( + self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True) + self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs], + non_blocking=True) + + # Fill unused with -1. Needed for reshape_and_cache + self.seq_lens[num_reqs:].fill_(0) + self.query_start_loc[num_reqs + 1:].fill_(-1) + + attn_metadata: dict[str, Union[AscendMetadata, AscendMLAMetadata, + AscendTorchairMetadata, + AscendMLATorchairMetadata]] = {} + # Prepare the attention metadata for each KV cache group and make layers + # in the same group share the same metadata. + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=self.query_start_loc[:num_reqs + 1], + query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], + seq_lens_cpu=self.seq_lens_cpu, + num_reqs=num_reqs, + max_query_len=max_num_scheduled_tokens, + num_actual_tokens=total_num_scheduled_tokens, + actual_seq_lengths_q=self.actual_seq_lengths_q, + block_table_tensor=self.input_batch.block_table[0]. 
+ get_device_tensor(), + slot_mapping_cpu=self.slot_mapping_cpu, + positions=self.positions, + attn_mask=self.attn_mask, + spec_attn_mask=self.spec_attn_mask, + attn_state=self.attn_state, + decode_token_per_req=self.decode_token_per_req, + ) + attn_metadata_i = self.attn_metadata_builder.build( + common_attn_metadata, self.get_model()) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i + + return attn_metadata + + def get_model(self) -> nn.Module: + # get raw model out of the aclgraph wrapper. + if isinstance(self.model, ACLGraphWrapper): + return self.model.unwrap() + return self.model + + def get_supported_generation_tasks(self) -> "list[GenerationTask]": + model = self.get_model() + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(model): + supported_tasks.append("generate") + + if supports_transcription(model): + if model.supports_transcription_only: + return ["transcription"] + + supported_tasks.append("transcription") + + return supported_tasks + + def get_supported_tasks(self) -> "tuple[SupportedTask, ...]": + tasks = list[SupportedTask]() + + if self.model_config.runner_type == "generate": + tasks.extend(self.get_supported_generation_tasks()) + if self.model_config.runner_type == "pooling": + tasks.extend(self.get_supported_pooling_tasks()) + + return tuple(tasks) + + def _make_attention_mask(self, seq_lens, position, + attn_state) -> torch.Tensor: + # Chunk Prefill situation. + if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla: + return self.attn_mask_builder.get_splitfuse_attn_mask( + seq_lens, position, self.dtype, self.device) + # Prefill without cache situation. + elif attn_state == AscendAttentionState.PrefillNoCache: + max_seq_len = max(seq_lens, default=0) + return self.attn_mask_builder.get_attn_mask( + max_seq_len, self.dtype, self.device) + # Prefill with cache hit. 
+ elif attn_state == AscendAttentionState.PrefillCacheHit: + return self.attn_mask_builder.get_attn_mask( + 128, self.dtype, self.device) + # Decode-only situation. + else: + return None + + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + for index, req_id in enumerate(self.input_batch.req_ids): + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + MRotaryEmbedding.get_next_input_positions_tensor( + out=self.mrope_positions_np, + out_offset=dst_start, + mrope_position_delta=req.mrope_position_delta, + context_len=num_computed_tokens + prompt_part_len, + num_new_tokens=completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + + def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the 
        # (continuation of _execute_mm_encoder)
        # Collect the scheduled multimodal inputs. The cache key differs by
        # vLLM version: (req_id, input_id) on 0.10.1.x, mm_hash afterwards.
        mm_kwargs = list[MultiModalKwargsItem]()
        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
        else:
            mm_hashes_pos = list[tuple[str, PlaceholderRange]]()
        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
            req_state = self.requests[req_id]
            if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                for mm_input_id in encoder_input_ids:
                    mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
                    req_ids_pos.append((req_id, mm_input_id,
                                        req_state.mm_positions[mm_input_id]))
            else:
                for mm_input_id in encoder_input_ids:
                    # TODO remove this assert after 0.10.1.1
                    assert req_state.mm_hashes is not None
                    mm_hash = req_state.mm_hashes[mm_input_id]
                    mm_kwargs.append(req_state.mm_kwargs[mm_input_id])
                    mm_hashes_pos.append(
                        (mm_hash, req_state.mm_positions[mm_input_id]))
        # Batch mm inputs as much as we can: if a request in the batch has
        # multiple modalities or a different modality than the previous one,
        # we process it separately to preserve item order.
        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
        # in the same batch while still being able to benefit from batching
        # multimodal inputs. The proper solution should be reordering the
        # encoder outputs.
        encoder_outputs = []
        for _, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
                mm_kwargs,
                device=self.device,
                pin_memory=True,
        ):
            # Run the encoder.
            # `curr_group_outputs` is either of the following:
            # 1. A tensor of shape (num_items, feature_size, hidden_size)
            # in case feature_size is fixed across all multimodal items.
            # 2. A list or tuple (length: num_items) of tensors, each of shape
            # (feature_size, hidden_size) in case the feature size is dynamic
            # depending on the input multimodal items.
            curr_group_outputs = self.model.get_multimodal_embeddings(
                **mm_kwargs_group)

            sanity_check_mm_encoder_outputs(
                curr_group_outputs,
                expected_num_items=num_items,
            )

            for output in curr_group_outputs:
                encoder_outputs.append(output)
        if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
            # Cache the encoder outputs, keyed by (req_id, input_id).
            for (req_id, input_id, pos_info), output in zip(
                    req_ids_pos,
                    encoder_outputs,
            ):
                if req_id not in self.encoder_cache:
                    self.encoder_cache[req_id] = {}

                self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
                    output,
                    is_embed=pos_info.is_embed,
                )
        else:
            # Cache the encoder outputs, keyed by mm_hash.
            for (mm_hash, pos_info), output in zip(mm_hashes_pos,
                                                   encoder_outputs):
                self.encoder_cache[mm_hash] = scatter_mm_placeholders(
                    output,
                    is_embed=pos_info.is_embed,
                )

    def _gather_mm_embeddings(
        self,
        scheduler_output: "SchedulerOutput",
    ) -> list[torch.Tensor]:
        """Gather the cached encoder embeddings that overlap this step's
        scheduled token window, per request (continued below)."""
        mm_embeds: list[torch.Tensor] = []
        for req_id in self.input_batch.req_ids:
            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
                req_id]
            req_state = self.requests[req_id]
            num_computed_tokens = req_state.num_computed_tokens
            mm_positions = req_state.mm_positions
            if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
                mm_hashes = req_state.mm_hashes
            for i, pos_info in enumerate(mm_positions):
                start_pos = pos_info.offset
                num_encoder_tokens = pos_info.length

                # The encoder output is needed if the two ranges overlap:
                # [num_computed_tokens,
                #  num_computed_tokens + num_scheduled_tokens) and
                # [start_pos, start_pos + num_encoder_tokens)
                if start_pos >= num_computed_tokens + num_scheduled_tokens:
                    # The encoder output is not needed in this step.
                    break
                if start_pos + num_encoder_tokens <= num_computed_tokens:
                    # The encoder output is already processed and stored
                    # in the decoder's KV cache.
                    continue

                # (continuation of _gather_mm_embeddings) Slice the cached
                # encoder output down to the part overlapping this step.
                start_idx = max(num_computed_tokens - start_pos, 0)
                if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
                    end_idx = min(
                        num_computed_tokens - start_pos + num_scheduled_tokens,
                        num_encoder_tokens)
                    assert start_idx < end_idx
                    assert req_id in self.encoder_cache
                    assert i in self.encoder_cache[req_id]
                    encoder_output = self.encoder_cache[req_id][i]
                else:
                    end_idx = min(
                        num_computed_tokens - start_pos + num_scheduled_tokens,
                        num_encoder_tokens,
                    )
                    assert start_idx < end_idx
                    # TODO remove this assert after 0.10.1.1
                    assert mm_hashes is not None
                    mm_hash = mm_hashes[i]
                    encoder_output = self.encoder_cache.get(mm_hash, None)
                    assert encoder_output is not None,\
                        f"Encoder cache miss for {mm_hash}."

                if (is_embed := pos_info.is_embed) is not None:
                    is_embed = is_embed[start_idx:end_idx]

                mm_embeds_item = gather_mm_placeholders(
                    encoder_output[start_idx:end_idx],
                    is_embed=is_embed,
                )
                mm_embeds.append(mm_embeds_item)
        return mm_embeds

    def _prepare_inputs(
        self,
        scheduler_output: "SchedulerOutput",
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> tuple[Union[AscendMetadata, AscendMLAMetadata, AscendTorchairMetadata,
                     AscendMLATorchairMetadata], torch.Tensor, np.ndarray, int,
               torch.Tensor, int, torch.Tensor, SpecDecodeMetadata,
               Optional[torch.Tensor], Optional[torch.Tensor],
               Optional[torch.Tensor]]:
        """Build all per-step model inputs from the scheduler output.

        Computes positions, slot mapping, attention state/mask and attention
        metadata; copies the CPU-side buffers to the NPU; resolves input_ids
        vs. inputs_embeds (multimodal); and prepares spec-decode metadata and
        logits indices. Returns the tuple unpacked by execute_model.
        """
        total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
        assert total_num_scheduled_tokens > 0
        num_reqs = self.input_batch.num_reqs
        assert num_reqs > 0

        self.attn_metadata_builder.reorder_batch(self.input_batch,
                                                 scheduler_output)
        # OPTIMIZATION: Start copying the block table first.
        # This way, we can overlap the copy with the following CPU operations.
        self.input_batch.block_table.commit_block_table(num_reqs)

        # Get the number of scheduled tokens for each request.
        # TODO: The Python loop can be slow. Optimize.
        num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32)
        num_valid_tokens = np.empty(num_reqs, dtype=np.int32)
        max_num_scheduled_tokens = 0
        for i, req_id in enumerate(self.input_batch.req_ids):
            num_tokens = scheduler_output.num_scheduled_tokens[req_id]
            num_scheduled_tokens[i] = num_tokens
            # "Valid" tokens exclude this request's scheduled draft tokens.
            num_valid_tokens[i] = num_tokens - \
                len(scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
            max_num_scheduled_tokens = max(max_num_scheduled_tokens,
                                           num_tokens)

        if (self.use_aclgraph and total_num_scheduled_tokens
                <= self.aclgraph_batch_sizes[-1]):
            # Add padding to the batch size.
            num_input_tokens = self.vllm_config.pad_for_cudagraph(
                total_num_scheduled_tokens)
        else:
            # Eager mode.
            num_input_tokens = total_num_scheduled_tokens

        # Get the attention state.
        attn_state = self._build_attn_state(num_reqs, num_scheduled_tokens,
                                            num_valid_tokens)
        self.attn_state = attn_state  # type: ignore

        # Determine if it's a splitfuse batch
        with_prefill = attn_state not in [
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
        ]

        self.query_lens = torch.from_numpy(num_scheduled_tokens)
        enable_dbo = self._check_dbo_is_valid(self.query_lens.tolist(),
                                              attn_state,
                                              total_num_scheduled_tokens)

        # Get info across DP ranks.
        # NOTE: maybe_padded_num_tokens is only used when using TorchAir with DP,
        # Otherwise, it's just max_tokens_across_dp_cpu
        (maybe_padded_num_tokens, num_tokens_across_dp, with_prefill,
         enable_dbo) = self._sync_metadata_across_dp(num_input_tokens,
                                                     with_prefill, enable_dbo)

        # TODO: Now that num_input_tokens is basically identical with maybe_padded_num_tokens
        # We should consider removing maybe_padded_num_tokens later
        num_input_tokens = maybe_padded_num_tokens

        # Hot-Swap lora model
        if self.lora_config:
            self.set_active_loras(self.input_batch, num_scheduled_tokens)

        # Prepare positions: for each request, a flat run of
        # [num_computed, num_computed + num_scheduled) position values.
        req_indices = np.repeat(self.arange_np[:num_reqs],
                                num_scheduled_tokens)
        cu_num_tokens = np.cumsum(num_scheduled_tokens)
        cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens,
                                    num_scheduled_tokens)
        arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets

        positions_np = self.positions_np[:total_num_scheduled_tokens]
        np.add(self.input_batch.num_computed_tokens_cpu[req_indices],
               arange,
               out=positions_np)

        # Calculate M-RoPE positions.
        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
        if self.uses_mrope:
            self._calc_mrope_positions(scheduler_output)

            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
            self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
                self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
                non_blocking=True)

        self.positions_cpu[total_num_scheduled_tokens:num_input_tokens].zero_()
        self.positions[:num_input_tokens].copy_(
            self.positions_cpu[:num_input_tokens], non_blocking=True)
        positions_cpu = self.positions_cpu[:num_input_tokens]
        positions = self.positions[:num_input_tokens]
        # NOTE(review): self.query_lens was already assigned this exact value
        # above — confirm the duplicate assignment is intentional.
        self.query_lens = torch.from_numpy(num_scheduled_tokens)

        self.seq_lens_np[:num_reqs] = (
            self.input_batch.num_computed_tokens_cpu[:num_reqs] +
            num_scheduled_tokens)
        seq_lens_cpu = self.seq_lens_cpu[:num_reqs]

        # Map each token position to its KV-cache slot via the block table.
        block_table_indices = (req_indices * self.max_num_blocks_per_req +
                               positions_np // self.block_size)

        block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor()
        block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
        block_offsets = positions_np % self.block_size
        np.add(block_numbers * self.block_size,
               block_offsets,
               out=self.slot_mapping_np[:total_num_scheduled_tokens])

        # NOTE(review): second _build_attn_state call with the same arguments;
        # unlike the first call, this one runs after seq_lens_np was refreshed
        # above, so its result may differ — confirm this ordering is intended.
        attn_state = self._build_attn_state(num_reqs, num_scheduled_tokens,
                                            num_valid_tokens)

        self.attn_mask = self._make_attention_mask(seq_lens=seq_lens_cpu,
                                                   position=positions_cpu,
                                                   attn_state=attn_state)
        self.attn_state = attn_state  # type: ignore

        self.query_start_loc_np[0] = 0
        self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
        self.query_start_loc[:num_reqs + 1].copy_(
            self.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
        self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs],
                                       non_blocking=True)

        # Fill unused with -1. Needed for reshape_and_cache
        self.seq_lens[num_reqs:].fill_(0)
        self.query_start_loc[num_reqs + 1:].fill_(-1)

        self.with_prefill = with_prefill
        self.num_tokens_across_dp = num_tokens_across_dp
        self._update_graph_pad_size(with_prefill, maybe_padded_num_tokens)
        common_attn_metadata = AscendCommonAttentionMetadata(
            query_start_loc=self.query_start_loc[:num_reqs + 1],
            query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
            seq_lens_cpu=self.seq_lens_cpu,
            num_reqs=num_reqs,
            num_actual_tokens=total_num_scheduled_tokens,
            actual_seq_lengths_q=self.actual_seq_lengths_q,
            block_table_tensor=self.input_batch.block_table[0].
            get_device_tensor(),
            slot_mapping_cpu=self.slot_mapping_cpu,
            positions=self.positions,
            attn_mask=self.attn_mask,
            spec_attn_mask=self.spec_attn_mask,
            attn_state=self.attn_state,
            enable_dbo_across_dp=enable_dbo,
            is_only_prefill=bool(np.all(num_valid_tokens != 1)),
            max_query_len=max_num_scheduled_tokens,
            graph_pad_size=self.graph_pad_size,
            decode_token_per_req=self.decode_token_per_req,
        )
        attn_metadata = self.attn_metadata_builder.build(
            common_attn_metadata, self.model)
        if self.vllm_config.model_config.use_mla:
            attn_metadata.num_input_tokens = num_input_tokens

        # Prepare input_ids
        token_indices = (positions_np +
                         req_indices * self.input_batch.token_ids_cpu.shape[1])
        torch.index_select(self.input_batch.token_ids_cpu_tensor.flatten(),
                           0,
                           torch.from_numpy(token_indices),
                           out=self.input_ids_cpu[:total_num_scheduled_tokens])
        # Copy the tensors to the NPU.
        self.input_ids[:total_num_scheduled_tokens].copy_(
            self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)

        # _prepare_inputs may reorder the batch, so we must gather multi
        # modal outputs after that to ensure the correct order
        if self.is_multimodal_model:
            # Run the multimodal encoder if any.
            self._execute_mm_encoder(scheduler_output)
            mm_embeds = self._gather_mm_embeddings(scheduler_output)

            # NOTE(woosuk): To unify token ids and soft tokens (vision
            # embeddings), we always use embeddings (rather than token ids)
            # as input to the multimodal model, even when the input is text.
            input_ids = self.input_ids[:total_num_scheduled_tokens]
            if mm_embeds:
                inputs_embeds = self.model.get_input_embeddings(
                    input_ids, mm_embeds)
            else:
                inputs_embeds = self.model.get_input_embeddings(input_ids)
            # TODO(woosuk): Avoid the copy. Optimize.
            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
                inputs_embeds)
            inputs_embeds = self.inputs_embeds[:num_input_tokens]
            input_ids = None
        else:
            # For text-only models, we use token ids as input.
            # While it is possible to use embeddings as input just like the
            # multimodal models, it is not desirable for performance since
            # then the embedding layer is not included in the ACL graph.
            input_ids = self.input_ids[:num_input_tokens]
            inputs_embeds = None
        positions = self.positions[:num_input_tokens]
        input_ids, positions = self._update_input_ids_and_positions(
            input_ids, positions, num_input_tokens, with_prefill,
            maybe_padded_num_tokens)

        if get_pp_group().is_first_rank:
            intermediate_tensors = None
        else:
            # Non-first PP ranks receive intermediate tensors from upstream.
            assert intermediate_tensors is not None
            assert self.intermediate_tensors is not None
            for k, v in intermediate_tensors.items():
                self.intermediate_tensors[k][:num_input_tokens].copy_(
                    v[:num_input_tokens], non_blocking=True)
            intermediate_tensors = IntermediateTensors({
                k: v[:num_input_tokens]
                for k, v in self.intermediate_tensors.items()
            })

        use_spec_decode = len(
            scheduler_output.scheduled_spec_decode_tokens) > 0
        if not use_spec_decode:
            # NOTE(woosuk): Due to chunked prefills, the batch may contain
            # partial requests. While we should not sample any token
            # from these partial requests, we do so for simplicity.
            # We will ignore the sampled tokens from the partial requests.
            # TODO: Support prompt logprobs.
            spec_decode_metadata = None
            # Sample from the last scheduled token of each request.
            logits_indices = torch.from_numpy(cu_num_tokens - 1).to(
                self.device, non_blocking=True)
        else:
            # Get the number of draft tokens for each request.
            # Iterate over the dictionary rather than all requests since not all
            # requests have draft tokens.
            num_draft_tokens = np.zeros(num_reqs, dtype=np.int32)
            for req_id, draft_token_ids in (
                    scheduler_output.scheduled_spec_decode_tokens.items()):
                req_idx = self.input_batch.req_id_to_index[req_id]
                num_draft_tokens[req_idx] = len(draft_token_ids)

            spec_decode_metadata = self._calc_spec_decode_metadata(
                num_draft_tokens, cu_num_tokens)
            logits_indices = spec_decode_metadata.logits_indices

        if lmhead_tp_enable():
            # Pad logits_indices so every DP rank presents the same count
            # to the tensor-parallel LM head.
            max_num_reqs_across_dp = maybe_padded_num_tokens if not with_prefill else self.max_num_reqs
            logits_indices = nn.functional.pad(
                logits_indices,
                (0, max_num_reqs_across_dp - logits_indices.shape[0]))

        return (attn_metadata, positions, num_scheduled_tokens,
                num_input_tokens, num_tokens_across_dp,
                maybe_padded_num_tokens, logits_indices, spec_decode_metadata,
                input_ids, inputs_embeds, intermediate_tensors)

    def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
                                             maybe_padded_num_tokens,
                                             input_ids, positions,
                                             intermediate_tensors,
                                             inputs_embeds):
        """Run the model forward pass and return its hidden states.

        NOTE(review): attn_metadata, with_prefill and maybe_padded_num_tokens
        are accepted but unused here — presumably consumed by subclass
        overrides; confirm before removing.
        """
        assert self.model is not None
        hidden_states = self.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def _build_attn_state(self, num_reqs, num_scheduled_tokens,
                          num_valid_tokens):
        """Classify the current batch into an AscendAttentionState.

        Inspects per-request scheduled/valid token counts against
        self.seq_lens_np to distinguish prefill (no cache / cache hit /
        chunked), decode-only, and speculative-decoding batches.
        """
        ascend_config = get_ascend_config()
        # Every scheduled length equals the full sequence length: fresh prefill.
        if np.array_equal(self.seq_lens_np[:num_reqs], num_scheduled_tokens):
            attn_state = AscendAttentionState.PrefillNoCache
        # We assume it is the decode stage, where prefill occurs but only one token is not hit in cache.
        elif np.all(num_scheduled_tokens == 1):
            attn_state = AscendAttentionState.DecodeOnly
            if self.speculative_config and self.speculative_config.method == 'deepseek_mtp':
                # SpecDecoding now supports seq_len=1 and seq_len=2
                # In the Prefilling/Decoding Disaggregation scenario,
                # SpecDecoding needs to support seq_len=1.
                attn_state = AscendAttentionState.SpecDecoding
        # Speculative decoding.
        elif np.all(num_valid_tokens == 1):
            if self.use_eagle:
                attn_state = AscendAttentionState.ChunkedPrefill
            else:
                attn_state = AscendAttentionState.SpecDecoding
        # splitfuse
        elif not ascend_config.ascend_scheduler_config.enabled or self.chunked_prefill_enabled:
            attn_state = AscendAttentionState.ChunkedPrefill
        else:
            attn_state = AscendAttentionState.PrefillCacheHit
        return attn_state

    def _update_graph_pad_size(self, with_prefill, graph_pad_size):
        """Reset graph padding; base implementation disables it (-1).

        Arguments are unused here — presumably meaningful in subclass
        overrides (e.g. TorchAir); confirm before changing the signature.
        """
        self.graph_pad_size = -1

    def _update_input_ids_and_positions(self, input_ids, positions,
                                        num_input_tokens, with_prefill,
                                        maybe_padded_num_tokens):
        """Swap in the M-RoPE position slice for M-RoPE models; otherwise
        pass inputs through unchanged."""
        if self.uses_mrope:
            positions = self.mrope_positions[:, :num_input_tokens]
        return input_ids, positions

    def _get_cumsum_and_arange(
        self,
        num_tokens: np.ndarray,
        cumsum_dtype: Optional[np.dtype] = None,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Get the cumulative sum and batched arange of the given array.

        E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2])
        Equivalent to but faster than:
        np.concatenate([np.arange(n) for n in num_tokens])
        """
        # Step 1. [2, 5, 3] -> [2, 7, 10]
        cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype)
        total_num_tokens = cu_num_tokens[-1]
        # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7]
        cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens)
        # Step 3.
        # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
        arange = self.arange_np[:total_num_tokens] - cumsums_offsets

        return cu_num_tokens, arange

    def _calc_spec_decode_metadata(
        self,
        num_draft_tokens: np.ndarray,
        cu_num_scheduled_tokens: np.ndarray,
    ) -> SpecDecodeMetadata:
        """Compute index tensors for speculative decoding and bundle them
        into a SpecDecodeMetadata (draft/bonus/target logits indices plus
        the draft token ids gathered from self.input_ids)."""
        # Inputs:
        # cu_num_scheduled_tokens:  [  4, 104, 107, 207, 209]
        # num_draft_tokens:         [  3,   0,   2,   0,   1]
        # Outputs:
        # cu_num_draft_tokens:      [  3,   3,   5,   5,   6]
        # logits_indices:           [  0,   1,   2,   3, 103, 104, 105, 106,
        #                            206, 207, 208]
        # target_logits_indices:    [  0,   1,   2,   5,   6,   9]
        # bonus_logits_indices:     [  3,   4,   7,   8,  10]

        # Compute the logits indices.
        # [4, 1, 3, 1, 2]
        num_sampled_tokens = num_draft_tokens + 1
        # Step 1. [4, 5, 8, 9, 11]
        cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32)
        total_num_sampled_tokens = cu_num_sampled_tokens[-1]
        # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9]
        cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens,
                                    num_sampled_tokens)
        # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1]
        arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets
        # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207]
        logits_indices = np.repeat(
            cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens)
        # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208]
        logits_indices += arange

        # Compute the bonus logits indices.
        bonus_logits_indices = cu_num_sampled_tokens - 1

        # Compute the draft logits indices.
        # [3, 3, 5, 5, 6]
        cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
        total_num_draft_tokens = cu_num_draft_tokens[-1]
        # [0, 0, 0, 3, 3, 5]
        cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens,
                                    num_draft_tokens)
        # [0, 1, 2, 0, 1, 0]
        arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets
        # [0, 0, 0, 5, 5, 9]
        target_logits_indices = np.repeat(
            cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens)
        # [0, 1, 2, 5, 6, 9]
        target_logits_indices += arange

        # TODO: Optimize the CPU -> NPU copy.
        cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
            self.device, non_blocking=True)
        logits_indices = torch.from_numpy(logits_indices).to(self.device,
                                                             non_blocking=True)
        target_logits_indices = torch.from_numpy(target_logits_indices).to(
            self.device, non_blocking=True)
        bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
            self.device, non_blocking=True)

        # Compute the draft token ids.
        # draft_token_indices:      [  1,  2,  3, 105, 106, 208]
        draft_token_ids = self.input_ids[logits_indices]
        draft_token_ids = draft_token_ids[target_logits_indices + 1]

        metadata = SpecDecodeMetadata(
            draft_token_ids=draft_token_ids,
            num_draft_tokens=num_draft_tokens.tolist(),
            cu_num_draft_tokens=cu_num_draft_tokens,
            target_logits_indices=target_logits_indices,
            bonus_logits_indices=bonus_logits_indices,
            logits_indices=logits_indices,
        )
        return metadata

    def apply_grammar_bitmask(
        self,
        scheduler_output: "SchedulerOutput",
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Apply the structured-output grammar bitmask to the logits
        (continued below); returns logits with masked tokens suppressed."""
        grammar_bitmask = scheduler_output.grammar_bitmask

        # We receive the structured output bitmask from the scheduler,
        # compacted to contain bitmasks only for structured output requests.
        # The order of the requests in the bitmask is not guaranteed to be the
        # same as the order of the requests in the gpu runner's batch. We need
        # to sort the bitmask to match the order of the requests used here.
        # (continuation of apply_grammar_bitmask)
        # Get the batch indices of the structured output requests.
        # Keep track of the number of speculative tokens scheduled for every
        # request in the batch, as the logit indices are offset by this amount.
        struct_out_req_batch_indices: dict[str, int] = {}
        cumulative_offset = 0
        seq = sorted(self.input_batch.req_id_to_index.items(),
                     key=lambda x: x[1])
        for req_id, batch_index in seq:
            logit_index = batch_index + cumulative_offset
            cumulative_offset += len(
                scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
            if req_id in scheduler_output.structured_output_request_ids:
                struct_out_req_batch_indices[req_id] = logit_index

        out_indices = []

        # Reorder the bitmask to match the order of the requests in the batch.
        sorted_bitmask = np.zeros_like(grammar_bitmask,
                                       shape=(logits.shape[0],
                                              grammar_bitmask.shape[1]))
        cumulative_index = 0
        seq = sorted(scheduler_output.structured_output_request_ids.items(),
                     key=lambda x: x[1])
        for req_id, _ in seq:
            logit_index = struct_out_req_batch_indices[req_id]
            num_spec_tokens = len(
                scheduler_output.scheduled_spec_decode_tokens.get(req_id, []))
            # One bitmask row per sampled token (bonus + speculative).
            for i in range(1 + num_spec_tokens):
                sorted_bitmask[logit_index + i] = \
                    grammar_bitmask[cumulative_index + i]
                out_indices.append(logit_index + i)
            cumulative_index += 1 + num_spec_tokens
        grammar_bitmask = sorted_bitmask

        # Serialization of np.ndarray is much more efficient than a tensor,
        # so we receive it in that format.
        grammar_bitmask = torch.from_numpy(grammar_bitmask)

        # NOTE:
        # 1. XGrammar bitmask applying only supports CPU and GPU.
        # 2. The logits and bitmask should be on the same device.
        # 3. XGrammar logits on CPU only supports float32 dtype.
        logits_dtype = logits.dtype
        logits = logits.to("cpu").float()
        xgr.apply_token_bitmask_inplace(
            logits,
            grammar_bitmask,
            indices=out_indices,
        )
        return logits.to(self.device).to(logits_dtype)

    def propose_draft_token_ids(
        self,
        valid_sampled_token_ids: list[list[int]],
        sampling_metadata: SamplingMetadata,
        scheduler_output: "SchedulerOutput",
        spec_decode_metadata: SpecDecodeMetadata,
        positions: torch.Tensor,
        num_scheduled_tokens: int,
        hidden_states: torch.Tensor,
        attn_metadata: Union[AscendMetadata, AscendMLAMetadata,
                             AscendTorchairMetadata,
                             AscendMLATorchairMetadata],
        aux_hidden_states: torch.Tensor = None,
    ) -> Optional[list[list[int]]]:
        """Dispatch draft-token proposal to the configured speculative
        decoding method (ngram / eagle3 / deepseek_mtp); returns None when
        speculative decoding is disabled. "eagle" is not implemented."""
        if not self.use_spec_decode:
            # Speculative decoding is not enabled.
            draft_token_ids = None
        elif self.speculative_config.method == "ngram":
            draft_token_ids = self._generate_ngram_token_ids(
                valid_sampled_token_ids)
        elif self.speculative_config.method == "eagle":
            raise NotImplementedError("Eagle Is Not Supported Yet.")
        elif self.speculative_config.method == "eagle3":
            draft_token_ids = self._generate_eagle3_token_ids(
                valid_sampled_token_ids, sampling_metadata, scheduler_output,
                spec_decode_metadata, positions, num_scheduled_tokens,
                hidden_states, aux_hidden_states)
        elif self.speculative_config.method == 'deepseek_mtp':
            draft_token_ids = self._generate_mtp_token_ids(
                valid_sampled_token_ids, sampling_metadata, scheduler_output,
                spec_decode_metadata, positions, num_scheduled_tokens,
                hidden_states, attn_metadata)
        return draft_token_ids

    def _pool_v010(
        self,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
        num_scheduled_tokens_np: np.ndarray,
        finished_sending: Optional[set[str]] = None,
        finished_recving: Optional[set[str]] = None,
        kv_connector_output: Optional["KVConnectorOutput"] = None,
    ) -> ModelRunnerOutput:
        """Pooling-model output path for vLLM 0.10.1.x: split hidden states
        per request, run the pooler, and emit per-request pooled tensors
        (None for requests whose prompt is not yet fully processed)."""
        assert self.input_batch.num_reqs ==\
            len(self.input_batch.pooling_params), \
            "Either all or none of the requests in" \
            " a batch must be pooling request"

        extracted_hidden_states = list(
            torch.split(hidden_states[:num_scheduled_tokens],
                        num_scheduled_tokens_np.tolist()))

        pooling_metadata = self.input_batch.pooling_metadata

        raw_pooler_output = self.model.pooler(
            hidden_states=extracted_hidden_states,
            pooling_metadata=pooling_metadata)

        pooler_output: list[Optional[torch.Tensor]] = []
        seq_lens = self.seq_lens[:self.input_batch.num_reqs]
        for raw_output, seq_len, prompt_len in zip(
                raw_pooler_output, seq_lens, pooling_metadata.prompt_lens):

            # Only emit output once the full prompt has been processed.
            if seq_len == prompt_len:
                pooler_output.append(raw_output.data.cpu())
            else:
                pooler_output.append(None)
        extra_args = ({"kv_connector_output": kv_connector_output})
        modelrunner_output = ModelRunnerOutput(
            req_ids=self.input_batch.req_ids,
            req_id_to_index=self.input_batch.req_id_to_index,
            sampled_token_ids=[],
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
            pooler_output=pooler_output,
            **extra_args,
        )
        return modelrunner_output

    def _pool(
        self,
        hidden_states: torch.Tensor,
        num_scheduled_tokens: int,
        num_scheduled_tokens_np: np.ndarray,
        finished_sending: Optional[set[str]] = None,
        finished_recving: Optional[set[str]] = None,
        kv_connector_output: Optional["KVConnectorOutput"] = None,
    ) -> ModelRunnerOutput:
        """Pooling-model output path for current vLLM: build a pooling cursor
        over the scheduled token counts, run the pooler, and emit per-request
        pooled tensors (None while a prompt is still partial)."""
        assert self.input_batch.num_reqs ==\
            len(self.input_batch.pooling_params), \
            "Either all or none of the requests in" \
            " a batch must be pooling request"

        hidden_states = hidden_states[:num_scheduled_tokens]
        pooling_metadata = self.input_batch.pooling_metadata
        pooling_metadata.build_pooling_cursor(num_scheduled_tokens_np.tolist(),
                                              device=hidden_states.device)
        seq_lens_cpu = self.seq_lens_cpu[:self.input_batch.num_reqs]

        # Pooling models D2H & synchronize occurs in pooler.py:build_output
        raw_pooler_output = self.model.pooler(
            hidden_states=hidden_states, pooling_metadata=pooling_metadata)

        pooler_output: list[Optional[torch.Tensor]] = []
        for raw_output, seq_len, prompt_len in zip(
                raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens):

            # Only emit output once the full prompt has been processed.
            if seq_len == prompt_len:
                pooler_output.append(raw_output.data)
            else:
                pooler_output.append(None)

        return ModelRunnerOutput(
            req_ids=self.input_batch.req_ids,
            req_id_to_index=self.input_batch.req_id_to_index,
            sampled_token_ids=[],
            logprobs=None,
            prompt_logprobs_dict={},
            pooler_output=pooler_output,
            kv_connector_output=kv_connector_output,
        )

    def _select_moe_comm_method(self, num_tokens: int) -> str:
        """Pick the MoE communication method for this step.

        "mc2" when the token count fits the MC2 capacity; otherwise
        "allgather" on A2 SoCs and "alltoall" on A3 SoCs.
        Raises ValueError for unsupported SoC versions.
        """
        soc_version = get_ascend_soc_version()

        if num_tokens <= self.mc2_tokens_capacity:
            moe_comm_method = "mc2"
        elif soc_version in {AscendSocVersion.A2}:
            moe_comm_method = "allgather"
        elif soc_version in {AscendSocVersion.A3}:
            moe_comm_method = "alltoall"
        else:
            raise ValueError(f"Unsupported soc_version: {soc_version}")

        if is_global_first_rank():
            logger.debug(f"num_tokens: {num_tokens}, "
                         f"moe_comm_method: {moe_comm_method}")

        return moe_comm_method

    @torch.inference_mode()
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> Union[ModelRunnerOutput, torch.Tensor]:
        """Execute one scheduler step: prepare inputs, run the forward pass,
        and post-process outputs (body continues past the visible region)."""
        with ProfileExecuteDuration().capture_async("prepare input"):
            self._update_states(scheduler_output)
            if not scheduler_output.total_num_scheduled_tokens:
                if not has_kv_transfer_group():
                    logger.debug(
                        "skip this step for we receive the data from remote disaggregate prefill node"
                    )
                    # Return empty ModelRunnerOutput if there's no work to do.
+ return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward(scheduler_output) + (attn_metadata, positions, num_scheduled_tokens_np, + num_input_tokens, num_tokens_across_dp, maybe_padded_num_tokens, + logits_indices, spec_decode_metadata, input_ids, inputs_embeds, + intermediate_tensors) = (self._prepare_inputs( + scheduler_output, intermediate_tensors)) + + moe_comm_method = self._select_moe_comm_method(num_input_tokens) + + batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, + uniform_decode=False) + aclgraph_runtime_mode, batch_descriptor = \ + self.aclgraph_dispatcher.dispatch(batch_descriptor) + + # Run forward pass + with ProfileExecuteDuration().capture_async("forward"): + with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp, + with_prefill=self.with_prefill, + reserved_mc2_mask=self.reserved_mc2_mask, + moe_comm_method=moe_comm_method, + aclgraph_runtime_mode=aclgraph_runtime_mode, + batch_descriptor=batch_descriptor, + num_actual_tokens=scheduler_output. 
+ total_num_scheduled_tokens): + self.maybe_setup_kv_connector(scheduler_output) + + hidden_states = self._generate_process_reqs_hidden_states( + attn_metadata, self.with_prefill, maybe_padded_num_tokens, + input_ids, positions, intermediate_tensors, inputs_embeds) + + self.maybe_wait_for_kv_save() + finished_sending, finished_recving = self.get_finished_kv_transfer( + scheduler_output) + + aux_hidden_states = None + if self.use_aux_hidden_state_outputs: + hidden_states, aux_hidden_states = hidden_states + + kv_connector_output = None + if finished_sending is not None or finished_recving is not None: + kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving) + else: + kv_connector_output = None + finished_sending = None + finished_recving = None + with ProfileExecuteDuration().capture_async("post process"): + # Broadcast PP output for external_launcher (torchrun) + # to make sure we are synced across pp ranks + # TODO: Support overlapping mirco-batches + # https://github.com/vllm-project/vllm/issues/18019 + broadcast_pp_output = \ + self.parallel_config.distributed_executor_backend \ + == "external_launcher" and len(get_pp_group().ranks) > 0 + if not get_pp_group().is_last_rank: + # For mid-pipeline stages, return the hidden states. 
+ if not broadcast_pp_output: + hidden_states.kv_connector_output = kv_connector_output + return hidden_states + assert isinstance(hidden_states, IntermediateTensors) + get_pp_group().send_tensor_dict( + hidden_states.tensors, all_gather_group=get_tp_group()) + logits = None + else: + if self.input_batch.pooling_params: + if vllm_version_is("0.10.1.1") or vllm_version_is( + "0.10.1"): + return self._pool_v010( + hidden_states, + scheduler_output.total_num_scheduled_tokens, + num_scheduled_tokens_np, finished_sending, + finished_recving, kv_connector_output) + else: + return self._pool( + hidden_states, + scheduler_output.total_num_scheduled_tokens, + num_scheduled_tokens_np, finished_sending, + finished_recving, kv_connector_output) + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + if broadcast_pp_output: + model_output_broadcast_data = { + "logits": logits.contiguous(), + } if logits is not None else {} + model_output_broadcast_data = get_pp_group( + ).broadcast_tensor_dict(model_output_broadcast_data, + src=len(get_pp_group().ranks) - 1) + assert model_output_broadcast_data is not None + logits = model_output_broadcast_data["logits"] + + # Apply structured output bitmasks if present + if scheduler_output.grammar_bitmask is not None: + logits = self.apply_grammar_bitmask(scheduler_output, logits) + + # Sample the next token and get logprobs if needed. + sampling_metadata = self.input_batch.sampling_metadata + if spec_decode_metadata is None: + if lmhead_tp_enable() and logits is not None: + logits = logits[:self.input_batch.num_reqs] + sampler_output = self.sampler( + logits=logits, + sampling_metadata=sampling_metadata, + ) + else: + if lmhead_tp_enable() and logits is not None: + logits = logits[:len(spec_decode_metadata.logits_indices)] + # When indexing with a tensor (bonus_logits_indices), PyTorch + # creates a new tensor with separate storage from the original + # logits tensor. 
This means any in-place operations on bonus_logits + # won't affect the original logits tensor. + assert logits is not None + bonus_logits = logits[ + spec_decode_metadata.bonus_logits_indices] + sampler_output = self.sampler( + logits=bonus_logits, + sampling_metadata=sampling_metadata, + ) + bonus_token_ids = sampler_output.sampled_token_ids + + # Just like `bonus_logits`, `target_logits` is a new tensor with + # separate storage from the original `logits` tensor. Therefore, + # it is safe to update `target_logits` in place. + target_logits = logits[ + spec_decode_metadata.target_logits_indices] + output_token_ids = self.rejection_sampler( + spec_decode_metadata, + None, # draft_probs + target_logits, + bonus_token_ids, + sampling_metadata, + ) + sampler_output.sampled_token_ids = output_token_ids + + discard_sampled_tokens_req_indices: list[int] = [] + # TODO(woosuk): The following loop can be slow since it iterates over + # the requests one by one. Optimize. + discard_sampled_tokens_req_indices = [] + for i, req_id in enumerate(self.input_batch.req_ids): + req_state = self.requests[req_id] + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + if seq_len < req_state.num_tokens: + # Ignore the sampled token. + # Rewind the generator state as if the token was not sampled. + generator = self.input_batch.generators.get(i) + if generator is not None: + generator.set_offset(generator.get_offset() - 4) + discard_sampled_tokens_req_indices.append(i) + + # NOTE: NPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. + logprobs_tensors = sampler_output.logprobs_tensors + logprobs_lists = logprobs_tensors.tolists() \ + if logprobs_tensors is not None else None + + # Compute prompt logprobs if needed. + prompt_logprobs_dict = self._get_prompt_logprobs_dict( + hidden_states[:scheduler_output.total_num_scheduled_tokens], + scheduler_output, + ) + + # Get the valid generated tokens. 
+ sampled_token_ids = sampler_output.sampled_token_ids + max_gen_len = sampled_token_ids.shape[-1] + if max_gen_len == 1: + # No spec decode tokens. + valid_sampled_token_ids = sampled_token_ids.tolist() + else: + # Includes spec decode tokens. + valid_sampled_token_ids = self.rejection_sampler.parse_output( + sampled_token_ids, + self.input_batch.vocab_size, + ) + + for i in discard_sampled_tokens_req_indices: + valid_sampled_token_ids[i].clear() + # Cache the sampled tokens in the model runner, so that the schedulerAdd commentMore actions + # doesn't need to send them back. + # NOTE(woosuk): As an exception, when using PP, the scheduler sends + # the sampled tokens back, because there's no direct communication + # between the first-stage worker and the last-stage worker. + for req_idx, sampled_ids in enumerate(valid_sampled_token_ids): + if not sampled_ids: + continue + + start_idx = self.input_batch.num_tokens_no_spec[req_idx] + end_idx = start_idx + len(sampled_ids) + assert end_idx <= self.model_config.max_model_len, ( + "Sampled token IDs exceed the max model length. 
" + f"Total number of tokens: {end_idx} > max_model_len: " + f"{self.model_config.max_model_len}") + + self.input_batch.token_ids_cpu[req_idx, + start_idx:end_idx] = sampled_ids + self.input_batch.num_tokens_no_spec[req_idx] = end_idx + self.input_batch.num_tokens[req_idx] = end_idx + req_id = self.input_batch.req_ids[req_idx] + req_state = self.requests[req_id] + req_state.output_token_ids.extend(sampled_ids) + + if self.speculative_config: + self._draft_token_ids = self.propose_draft_token_ids( + valid_sampled_token_ids, + sampling_metadata, + scheduler_output, + spec_decode_metadata, + positions, + scheduler_output.total_num_scheduled_tokens, + hidden_states, + attn_metadata, + aux_hidden_states, + ) + + if has_kv_transfer_group(): + get_kv_transfer_group().clear_connector_metadata() + + extra_args = ({"kv_connector_output": kv_connector_output}) + + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + spec_token_ids=self._draft_token_ids, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) + else: + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=valid_sampled_token_ids, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + **extra_args, + ) + + durations = ProfileExecuteDuration().pop_captured_sync() + if durations: + dr_str = [ + f"[{tag}]:{duration:.2f}ms" + for tag, duration in durations.items() + ] + captured_name = "Decode" if self.attn_state == AscendAttentionState.DecodeOnly else "Prefill" + logger.info("Profile execute duration [%s]:%s", captured_name, + " ".join(dr_str)) + + return model_runner_output + + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: + if 
self._draft_token_ids is None: + return None + req_ids = self.input_batch.req_ids + if isinstance(self._draft_token_ids, torch.Tensor): + draft_token_ids = self._draft_token_ids.tolist() + else: + draft_token_ids = self._draft_token_ids + self._draft_token_ids = None + return DraftTokenIds(req_ids, draft_token_ids) + + def kv_connector_no_forward( + self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput: + with set_ascend_forward_context(None, self.vllm_config): + self.maybe_setup_kv_connector(scheduler_output) + finished_sending, finished_recving = ( + self.get_finished_kv_transfer(scheduler_output)) + # For the case of no forward caused by receiving remote kv, + # one round of dummy inference is necessary + # to prevent hang over the collective calls. + if not finished_sending and not finished_recving: + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving) + return output + + @staticmethod + def maybe_setup_kv_connector(scheduler_output: "SchedulerOutput"): + # Update KVConnector with the KVConnector metadata forward(). 
@staticmethod
def maybe_wait_for_kv_save() -> None:
    """Block until all pending KV-cache saves are flushed (no-op when no
    KV transfer group is configured)."""
    if has_kv_transfer_group():
        get_kv_transfer_group().wait_for_save()

@staticmethod
def get_finished_kv_transfer(
    scheduler_output: "SchedulerOutput",
) -> tuple[Optional[set[str]], Optional[set[str]]]:
    """Return (finished_sending, finished_recving) request-id sets from
    the KV transfer group, or (None, None) when no group exists."""
    if has_kv_transfer_group():
        return get_kv_transfer_group().get_finished(
            scheduler_output.finished_req_ids)
    return None, None

def _build_attention_metadata(self, with_prefill, num_reqs, skip_attn):
    """Build attention metadata for a dummy run.

    Currently always returns None: attention is skipped both when
    requested (skip_attn=True) and otherwise, because full-graph capture
    is not supported yet (see TODO).
    """
    if skip_attn:
        attn_metadata = None
    else:
        # TODO(zzzzwwjj): when aclgraph and full graph mode, we need to
        # build attn_metadata here.
        attn_metadata = None
    return attn_metadata

def _generate_dummy_run_hidden_states(self, with_prefill,
                                      is_torchair_compile, input_ids,
                                      positions, attn_metadata, num_tokens,
                                      intermediate_tensors, inputs_embeds):
    """Run the model once on dummy inputs and return its hidden states.

    NOTE(review): `with_prefill`, `is_torchair_compile` and
    `attn_metadata` are unused in this body; they appear to be kept for
    signature compatibility with other runner variants — confirm before
    removing.
    """
    hidden_states = self.model(input_ids=input_ids,
                               positions=positions,
                               intermediate_tensors=intermediate_tensors,
                               inputs_embeds=inputs_embeds)
    if self.use_aux_hidden_state_outputs:
        # Aux-hidden-state models return (hidden_states, aux); the aux
        # part is not needed for a dummy run.
        hidden_states, _ = hidden_states
    if self.use_spec_decode and isinstance(self.drafter, EagleProposer):
        self.drafter.dummy_run(num_tokens)
    return hidden_states

@torch.inference_mode()
def _dummy_run(
    self,
    num_tokens: int,
    with_prefill: bool = False,
    is_torchair_compile: bool = False,
    aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
    force_attention: bool = False,
    uniform_decode: bool = False,
) -> torch.Tensor:
    """Execute one forward pass on synthetic inputs.

    Used for warmup, profiling and ACL-graph capture. Returns the hidden
    states produced by the model.
    """
    # Only eager mode and piecewise graph are supported now.
    assert aclgraph_runtime_mode in {
        CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE
    }
    if force_attention:
        raise RuntimeError(
            "Capturing attention in aclgraph is unexpected, because full graph is not supported now"
        )

    # Padding for DP.
    (num_tokens, num_tokens_across_dp, with_prefill,
     _) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)

    moe_comm_method = self._select_moe_comm_method(num_tokens)

    # If cudagraph_mode.decode_mode() == FULL and
    # cudagraph_mode.separate_routine(), we are using different graphs
    # and/or modes for mixed prefill-decode batches vs. uniform decode
    # batches. A uniform decode batch means all requests have identical
    # query length, except a potential virtual (shorter) request in the
    # batch accounting for padding. It is either pure decode
    # (max_query_len == 1) or speculative decode
    # (max_query_len == 1 + num_spec_decode_tokens).
    #
    # Setting max_query_len = 1 switches to (and captures) the optimized
    # pure-decode routine (Flashdecode + a GQA/MQA optimization).
    max_query_len = self.uniform_decode_query_len if uniform_decode else \
        num_tokens

    # Set num_scheduled_tokens based on num_tokens and max_num_seqs for a
    # dummy run with LoRA, so that the num_reqs collectively have
    # num_tokens in total.
    assert num_tokens <= self.scheduler_config.max_num_batched_tokens
    max_num_reqs = self.scheduler_config.max_num_seqs
    if uniform_decode:
        num_reqs = cdiv(num_tokens, max_query_len)
        assert num_reqs <= max_num_reqs, \
            "Do not capture num_reqs > max_num_reqs for uniform batch"
        num_scheduled_tokens_list = [max_query_len] * num_reqs
        if num_tokens % max_query_len != 0:
            num_scheduled_tokens_list[-1] = num_tokens % max_query_len
    else:
        if with_prefill:
            num_reqs = num_tokens
        else:
            num_reqs = (num_tokens + self.decode_token_per_req -
                        1) // self.decode_token_per_req
        num_reqs = min(num_reqs, max_num_reqs)
        min_tokens_per_req = num_tokens // num_reqs
        num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
        num_scheduled_tokens_list[-1] += num_tokens % num_reqs
    assert sum(num_scheduled_tokens_list) == num_tokens
    assert len(num_scheduled_tokens_list) == num_reqs
    num_scheduled_tokens = np.array(num_scheduled_tokens_list,
                                    dtype=np.int32)

    # Force dummy run on prefill stage when this node is deemed as kv
    # producer.
    if self.is_kv_producer:
        with_prefill = True

    attn_metadata = self._build_attention_metadata(with_prefill,
                                                   num_reqs,
                                                   skip_attn=True)

    with self.maybe_dummy_run_with_lora(self.lora_config,
                                        num_scheduled_tokens):
        if self.is_multimodal_model:
            input_ids = None
            inputs_embeds = self.inputs_embeds[:num_tokens]
        else:
            input_ids = self.input_ids[:num_tokens]
            inputs_embeds = None

        if self.uses_mrope:
            positions = self.mrope_positions[:, :num_tokens]
        else:
            positions = self.positions[:num_tokens]

        if get_pp_group().is_first_rank:
            intermediate_tensors = None
        else:
            if self.intermediate_tensors is None:
                self.intermediate_tensors = (
                    self.model.make_empty_intermediate_tensors(
                        batch_size=num_tokens,
                        dtype=self.dtype,
                        device=self.device))
            intermediate_tensors = IntermediateTensors({
                k: v[:num_tokens]
                for k, v in self.intermediate_tensors.items()
            })

        if aclgraph_runtime_mode == CUDAGraphMode.NONE:
            batch_descriptor = None
        else:
            # Filter out the valid batch descriptor.
            _cg_mode, batch_descriptor = \
                self.aclgraph_dispatcher.dispatch(
                    BatchDescriptor(num_tokens=num_tokens,
                                    uniform_decode=uniform_decode))
            # Sanity check: the dispatcher must agree with the caller.
            assert aclgraph_runtime_mode == _cg_mode, (
                f"Aclgraph runtime mode mismatch at dummy_run. "
                f"Expected {_cg_mode}, but got {aclgraph_runtime_mode}.")

        need_dummy_logits = (not self.in_profile_run
                             and lmhead_tp_enable())

        if need_dummy_logits:
            max_num_reqs_across_dp = num_tokens if not with_prefill else max_num_reqs
            dummy_indices = torch.zeros(max_num_reqs_across_dp,
                                        dtype=torch.int32)

            def dummy_compute_logits(hidden_states):
                return self.model.compute_logits(
                    hidden_states[dummy_indices], None)

        with set_ascend_forward_context(
                attn_metadata,
                self.vllm_config,
                num_tokens=num_tokens,
                num_tokens_across_dp=num_tokens_across_dp,
                with_prefill=with_prefill,
                in_profile_run=self.in_profile_run,
                reserved_mc2_mask=self.reserved_mc2_mask,
                moe_comm_method=moe_comm_method,
                num_actual_tokens=0,
                aclgraph_runtime_mode=aclgraph_runtime_mode,
                batch_descriptor=batch_descriptor):
            hidden_states = self._generate_dummy_run_hidden_states(
                with_prefill, is_torchair_compile, input_ids, positions,
                attn_metadata, num_tokens, intermediate_tensors,
                inputs_embeds)
            if need_dummy_logits:
                dummy_compute_logits(hidden_states)

        if self.speculative_config and self.speculative_config.method == "deepseek_mtp":
            assert isinstance(self.drafter, MtpProposer)
            self.drafter.dummy_run(
                num_tokens=num_tokens,
                with_prefill=with_prefill,
                skip_attn=True,
                num_reqs=num_reqs,
                num_tokens_across_dp=num_tokens_across_dp)
            if need_dummy_logits:
                dummy_compute_logits(hidden_states)
        return hidden_states

@contextmanager
def set_in_profile_run(self):
    """Mark the runner as being inside a profile run for the duration of
    the `with` block (always reset on exit)."""
    self.in_profile_run = True
    try:
        yield
    finally:
        self.in_profile_run = False

def profile_run(self) -> None:
    """Run one maximum-size dummy batch to trigger compilation and to
    measure peak memory, then release all intermediate buffers."""
    # Trigger compilation for general shape.
    with self.set_in_profile_run():
        hidden_states = self._dummy_run(self.max_num_tokens,
                                        with_prefill=True)
    output = None
    if get_pp_group().is_last_rank:
        if self.is_pooling_model:
            output = self._dummy_pooler_run(hidden_states)
        else:
            # For profile, have maximum num_reqs that collectively have
            # maximum num_tokens.
            min_tokens_per_req = self.max_num_tokens // self.max_num_reqs
            num_scheduled_tokens_list = [min_tokens_per_req
                                         ] * self.max_num_reqs
            num_scheduled_tokens_list[
                -1] += self.max_num_tokens % self.max_num_reqs
            num_scheduled_tokens = np.array(num_scheduled_tokens_list,
                                            dtype=np.int32)
            logit_indices = np.cumsum(num_scheduled_tokens) - 1
            # TODO: need to run a dummy sampler for generate task
            hidden_states = hidden_states[logit_indices]
            output = self.model.compute_logits(hidden_states, None)

    NPUPlatform.synchronize()
    del hidden_states, output
    self.encoder_cache.clear()
    gc.collect()
def _dummy_pooler_run_task(
    self,
    hidden_states: torch.Tensor,
    task: PoolingTask,
) -> PoolerOutput:
    """Warm up the pooler for a single pooling task on dummy requests.

    Splits `hidden_states` across up to `max_num_seqs` synthetic requests
    and invokes the model's pooler; raises a descriptive RuntimeError if
    the warmup runs out of device memory.
    """
    num_tokens = hidden_states.shape[0]
    max_num_reqs = self.scheduler_config.max_num_seqs
    num_reqs = min(num_tokens, max_num_reqs)
    min_tokens_per_req = num_tokens // num_reqs
    num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
    # The last request absorbs the remainder so the totals match.
    num_scheduled_tokens_list[-1] += num_tokens % num_reqs
    assert sum(num_scheduled_tokens_list) == num_tokens
    assert len(num_scheduled_tokens_list) == num_reqs

    hidden_states_list = list(
        torch.split(hidden_states, num_scheduled_tokens_list))
    req_num_tokens = num_tokens // num_reqs

    dummy_token_ids = torch.zeros((num_reqs, req_num_tokens),
                                  dtype=torch.int32,
                                  device=self.device)

    model = cast(VllmModelForPooling, self.get_model())
    dummy_pooling_params = PoolingParams(task=task)
    to_update = model.pooler.get_pooling_updates(task)
    to_update.apply(dummy_pooling_params)

    if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
        # Old vLLM versions pool over a list of per-request tensors.
        dummy_prompt_lens = torch.tensor(
            [h.shape[0] for h in hidden_states_list],
            device=self.device,
        )
        dummy_metadata = PoolingMetadata(
            prompt_lens=dummy_prompt_lens,
            prompt_token_ids=dummy_token_ids,
            pooling_params=[dummy_pooling_params] * num_reqs,
        )

        try:
            return model.pooler(hidden_states=hidden_states_list,
                                pooling_metadata=dummy_metadata)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                raise RuntimeError(
                    "NPU out of memory occurred when warming up pooler "
                    f"({task=}) with {num_reqs} dummy requests. Please try "
                    "lowering `max_num_seqs` or `gpu_memory_utilization` when "
                    "initializing the engine.") from e
            else:
                raise e
    else:
        # Newer versions pool over the flat tensor plus a pooling cursor.
        dummy_prompt_lens = torch.tensor(
            num_scheduled_tokens_list,
            device="cpu",
        )
        dummy_metadata = PoolingMetadata(
            prompt_lens=dummy_prompt_lens,
            prompt_token_ids=dummy_token_ids,
            pooling_params=[dummy_pooling_params] * num_reqs,
        )

        dummy_metadata.build_pooling_cursor(num_scheduled_tokens_list,
                                            device=hidden_states.device)

        try:
            return model.pooler(hidden_states=hidden_states,
                                pooling_metadata=dummy_metadata)
        except RuntimeError as e:
            if 'out of memory' in str(e):
                # Fixed: message previously said "CUDA out of memory" on
                # this NPU platform, inconsistent with the branch above.
                raise RuntimeError(
                    "NPU out of memory occurred when warming up pooler "
                    f"({task=}) with {num_reqs} dummy requests. Please try "
                    "lowering `max_num_seqs` or `gpu_memory_utilization` when "
                    "initializing the engine.") from e
            else:
                raise e
@torch.inference_mode()
def _dummy_pooler_run(
    self,
    hidden_states: torch.Tensor,
) -> PoolerOutput:
    """Warm up every supported pooling task and re-run the one with the
    largest output (the worst case for subsequent memory planning)."""
    output_size = dict[PoolingTask, float]()
    for task in self.get_supported_pooling_tasks():
        # Run a full batch per task to ensure none of them OOMs.
        output = self._dummy_pooler_run_task(hidden_states, task)
        output_size[task] = output.get_data_nbytes()
        del output  # allow GC between tasks

    biggest_task = max(output_size.items(), key=lambda kv: kv[1])[0]
    return self._dummy_pooler_run_task(hidden_states, biggest_task)

def load_model(self) -> None:
    """Load model weights (plus drafter/LoRA) and log memory usage."""
    logger.info("Starting to load model %s...", self.model_config.model)

    with DeviceMemoryProfiler() as mem_profiler:  # noqa: SIM117
        self.model = get_model(vllm_config=self.vllm_config)

        if is_310p():
            # 310P requires weights of these linear layers in a custom
            # ACL tensor format.
            from vllm.model_executor.layers.linear import (
                MergedColumnParallelLinear, QKVParallelLinear,
                RowParallelLinear)
            for module in self.model.modules():
                if isinstance(module,
                              (MergedColumnParallelLinear,
                               QKVParallelLinear, RowParallelLinear)):
                    module.weight.data = self._convert_torch_format(
                        module.weight.data)
        if self.drafter:
            logger.info("Loading drafter model...")
            if isinstance(self.drafter, EagleProposer):
                if self.use_aux_hidden_state_outputs:
                    self.drafter.load_model(self.model)
                    self.model.set_aux_hidden_state_layers(
                        self.model.get_eagle3_aux_hidden_state_layers())
                else:
                    self.drafter.load_model()
            else:
                self.drafter.load_model()
        if self.lora_config:
            self.model = self.load_lora_model(self.model,
                                              self.model_config,
                                              self.scheduler_config,
                                              self.lora_config,
                                              self.device)
    logger.info("Loading model weights took %.4f GB",
                mem_profiler.consumed_memory / float(2**30))

def _convert_torch_format(self, tensor):
    """Cast a tensor to the platform's preferred ACL storage format."""
    return torch_npu.npu_format_cast(tensor, ACL_FORMAT)

def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
    """
    Initialize KV cache based on `kv_cache_config`.
    Args:
        kv_cache_config: Configuration for the KV cache, including the KV
        cache size of each layer
    """
    self.kv_cache_config = kv_cache_config
    kv_caches: Dict[str, torch.Tensor] = {}

    def align_memory(tensor: torch.Tensor, alignment: int) -> torch.Tensor:
        # Return a view of `tensor` whose start address is aligned to
        # `alignment` bytes (needed by llmdatadist register_memory).
        data_ptr = tensor.data_ptr()
        aligned_addr = (data_ptr + alignment - 1) // alignment * alignment
        offset = (aligned_addr - data_ptr) // tensor.element_size()
        return tensor[int(offset):]

    self.input_batch = InputBatch(
        max_num_reqs=self.max_num_reqs,
        max_model_len=self.model_config.max_model_len,
        max_num_batched_tokens=self.max_num_tokens,
        device=self.device,
        pin_memory=self.pin_memory,
        vocab_size=self.model_config.get_vocab_size(),
        block_sizes=[self.block_size],
        is_spec_decode=bool(self.vllm_config.speculative_config),
        logitsprocs=build_logitsprocs(
            self.vllm_config, self.device, self.pin_memory,
            self.is_pooling_model,
            self.vllm_config.model_config.logits_processors),
        is_pooling_model=self.is_pooling_model,
    )

    kv_cache_sizes = {}
    for kv_cache_tensor in kv_cache_config.kv_cache_tensors:
        assert len(kv_cache_tensor.shared_by) == 1, (
            "KV cache tensor shared by multiple layers is not supported in "
            "NPU.")
        kv_cache_sizes[kv_cache_tensor.shared_by[0]] = kv_cache_tensor.size

    for kv_cache_group in kv_cache_config.kv_cache_groups:
        kv_cache_spec = kv_cache_group.kv_cache_spec
        for layer_name in kv_cache_group.layer_names:
            tensor_size = kv_cache_sizes[layer_name]
            assert tensor_size % kv_cache_spec.page_size_bytes == 0
            num_blocks = tensor_size // kv_cache_spec.page_size_bytes

            # `num_blocks` is what this runner can use, while
            # `kv_cache_config.num_blocks` is what the KVCacheManager may
            # allocate. Different devices can have different layer counts
            # and memory capacities, so the config value is the min over
            # all devices — verify that here.
            assert num_blocks >= kv_cache_config.num_blocks
            alignment = 2 * 1024 * 1024
            # TODO: remove this after the OOM issue is located and
            # fixed; otherwise some models may encounter OOM.
            if isinstance(kv_cache_spec, FullAttentionSpec):
                if self.vllm_config.additional_config.get(
                        "kv_cache_dtype", None) == 'int8':
                    kv_cache_shape = self.attn_backend.get_bsh_kv_cache_shape(
                        num_blocks, kv_cache_spec.block_size,
                        kv_cache_spec.num_kv_heads,
                        kv_cache_spec.head_size)
                else:
                    kv_cache_shape = self.attn_backend.get_kv_cache_shape(
                        num_blocks, kv_cache_spec.block_size,
                        kv_cache_spec.num_kv_heads,
                        kv_cache_spec.head_size)
                dtype = kv_cache_spec.dtype
                if self.model_config.is_deepseek_mla:
                    # MLA keeps separate "nope" and "rope" caches.
                    num_blocks, block_size, num_kv_heads, head_size = kv_cache_shape
                    rope_dim = self.model_config.hf_text_config.qk_rope_head_dim
                    nope_dim = head_size - rope_dim
                    nope_cache_shape = (num_blocks, block_size,
                                        num_kv_heads, nope_dim)
                    rope_cache_shape = (num_blocks, block_size,
                                        num_kv_heads, rope_dim)
                    if self.vllm_config.kv_transfer_config is None:
                        # No disaggregated prefill/decode: allocate the
                        # KV cache in the normal way.
                        rope_cache = torch.zeros(rope_cache_shape,
                                                 dtype=dtype,
                                                 device=self.device)
                        nope_cache = torch.zeros(nope_cache_shape,
                                                 dtype=dtype,
                                                 device=self.device)
                        rope_cache = self._convert_torch_format(rope_cache)
                        nope_cache = self._convert_torch_format(nope_cache)
                    else:
                        # To transfer the KV cache through llmdatadist's
                        # register_memory API, addresses must be 2M
                        # aligned. torch_npu usually returns 2M-aligned
                        # memory, but exceptions were observed in tests,
                        # so we over-allocate and align manually. This
                        # may cost 2M * 2 * elem_size per layer.
                        nope_allocate_shape = num_blocks * block_size * num_kv_heads * nope_dim
                        nope_allocate_shape_alignment = nope_allocate_shape + alignment
                        rope_allocate_shape = num_blocks * block_size * num_kv_heads * rope_dim
                        rope_allocate_shape_alignment = rope_allocate_shape + alignment

                        nope_cache = torch.zeros(
                            nope_allocate_shape_alignment,
                            dtype=dtype,
                            device=self.device)
                        rope_cache = torch.zeros(
                            rope_allocate_shape_alignment,
                            dtype=dtype,
                            device=self.device)
                        nope_cache = align_memory(
                            nope_cache,
                            alignment)[:nope_allocate_shape].view(
                                nope_cache_shape)
                        rope_cache = align_memory(
                            rope_cache,
                            alignment)[:rope_allocate_shape].view(
                                rope_cache_shape)
                    kv_caches[layer_name] = (nope_cache, rope_cache)
                else:
                    num_caches = kv_cache_shape[0]
                    kv_cache_list = []
                    for cache_idx in range(num_caches):
                        cache_shape = kv_cache_shape[1:]
                        if self.vllm_config.kv_transfer_config is None:
                            kv_cache = torch.zeros(cache_shape,
                                                   dtype=dtype,
                                                   device=self.device)
                            kv_cache = self._convert_torch_format(kv_cache)
                        else:
                            # Same manual 2M alignment as the MLA branch.
                            cache_size = math.prod(cache_shape)
                            cache_size_aligned = cache_size + alignment
                            kv_cache = torch.zeros(cache_size_aligned,
                                                   dtype=dtype,
                                                   device=self.device)
                            kv_cache = align_memory(
                                kv_cache,
                                alignment)[:cache_size].view(cache_shape)
                        kv_cache_list.append(kv_cache)
                    kv_caches[layer_name] = tuple(kv_cache_list)
            else:
                # TODO: add new branches when introducing more types of
                # KV cache specs.
                raise ValueError("Unknown KV cache spec type.")

    bind_kv_cache(kv_caches,
                  self.compilation_config.static_forward_context,
                  self.kv_caches)

    if has_kv_transfer_group():
        get_kv_transfer_group().register_kv_caches(kv_caches)
def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
    """
    Generates the KVCacheSpec by parsing the kv cache format from each
    Attention module in the static forward context.
    Returns:
        KVCacheSpec: A dictionary mapping layer names to their KV cache
        format. Layers that do not need KV cache are not included.
    """
    forward_ctx = self.compilation_config.static_forward_context
    use_mla = self.vllm_config.model_config.use_mla
    kv_cache_spec: dict[str, KVCacheSpec] = {}
    for layer_name, attn_module in forward_ctx.items():
        if isinstance(attn_module, FusedMoE):
            continue

        # TODO: Support other attention modules, e.g. sliding window,
        # cross-attention.
        assert isinstance(attn_module, Attention)
        attn_type = attn_module.attn_type
        if attn_type == AttentionType.DECODER:
            kv_cache_spec[layer_name] = FullAttentionSpec(
                block_size=self.block_size,
                num_kv_heads=attn_module.num_kv_heads,
                head_size=attn_module.head_size,
                dtype=self.kv_cache_dtype,
                use_mla=use_mla)
        elif attn_type in (AttentionType.ENCODER,
                           AttentionType.ENCODER_ONLY):
            # Encoder-only attention does not need KV cache.
            continue
        elif attn_type == AttentionType.ENCODER_DECODER:
            raise NotImplementedError
        else:
            raise ValueError(f"Unknown attention type: {attn_type}")

    return kv_cache_spec

def initialize_aclgraph_capture(self) -> None:
    """Initialize the dispatching keys for ACL-graph capture."""
    # TODO: Add check of AttentionCGSupport and cudagraph_mode.decode_mode
    # when full graph is supported.
    # Trigger aclgraph dispatching keys initialization here (after
    # initializing attn backends).
    self.aclgraph_dispatcher.initialize_cudagraph_keys(
        self.compilation_config.cudagraph_mode,
        self.uniform_decode_query_len)
def _capture_aclgraphs(self, compilation_cases: list[int],
                       aclgraph_runtime_mode: CUDAGraphMode,
                       uniform_decode: bool):
    """Capture one ACL graph per batch size in `compilation_cases`,
    warming up each shape first."""
    assert aclgraph_runtime_mode != CUDAGraphMode.NONE and \
        aclgraph_runtime_mode in [CUDAGraphMode.PIECEWISE]

    # Only rank 0 should print a progress bar during capture.
    if is_global_first_rank():
        compilation_cases = tqdm(
            compilation_cases,
            disable=not self.load_config.use_tqdm_on_load,
            desc="Capturing ACL graphs ({}, {})".format(
                "decode" if uniform_decode else "mixed prefill-decode",
                aclgraph_runtime_mode.name))
    # We skip EPLB here since we don't want to record dummy metrics.
    for num_tokens in compilation_cases:
        for _ in range(self.compilation_config.cudagraph_num_of_warmups):
            # Use CUDAGraphRuntimeStyle.NONE (default) for warmup.
            # But be careful: warming up with `NONE` is orthogonal to
            # whether we want to warm up attention or not. This is
            # different from the case where `FULL` implies capture
            # attention while `PIECEWISE` implies no attention.
            force_attention = (aclgraph_runtime_mode == CUDAGraphMode.FULL)
            self._dummy_run(num_tokens,
                            aclgraph_runtime_mode=CUDAGraphMode.NONE,
                            force_attention=force_attention,
                            uniform_decode=uniform_decode)
        self._dummy_run(num_tokens,
                        aclgraph_runtime_mode=aclgraph_runtime_mode,
                        uniform_decode=uniform_decode)

def _capture_model(self):
    """Capture ACL graphs for all configured batch sizes, largest first
    so smaller shapes can reuse the large shapes' memory pool."""
    if not self.use_aclgraph:
        # Fixed: the hint previously misspelled the option as
        # `aclraph_mode`.
        logger.warning(
            "Skipping ACL graph capture. To turn on ACL graph capture, "
            "ensure `aclgraph_mode` was not manually set to `NONE`")
        return
    self.initialize_aclgraph_capture()

    set_cudagraph_capturing_enabled(True)
    # Trigger ACL graph capture for specific shapes.
    # Capture the large shapes first so that the smaller shapes
    # can reuse the memory pool allocated for the large shapes.
    with graph_capture(device=self.device):
        aclgraph_mode = self.compilation_config.cudagraph_mode
        if aclgraph_mode.mixed_mode() != CUDAGraphMode.NONE:
            aclgraph_runtime_mode = aclgraph_mode.mixed_mode()

            compilation_cases = list(reversed(self.aclgraph_batch_sizes))
            self._capture_aclgraphs(
                compilation_cases,
                aclgraph_runtime_mode=aclgraph_runtime_mode,
                uniform_decode=False)

    # Disable aclgraph capturing globally, so any unexpected aclgraph
    # capturing will be detected and raise an error after here.
    # Note: we don't put this inside the graph_capture context manager
    # because we may do lazy capturing in the future, which still allows
    # capturing after this point.
    set_cudagraph_capturing_enabled(False)
def capture_model(self) -> None:
    """Capture ACL graphs and log the elapsed time and memory cost."""
    compilation_counter.num_gpu_runner_capture_triggers += 1

    start_time = time.perf_counter()
    start_free_npu_memory = torch.npu.mem_get_info()[0]

    self._capture_model()

    elapsed_time = time.perf_counter() - start_time
    npu_graph_size = start_free_npu_memory - torch.npu.mem_get_info()[0]
    # This usually takes 5~20 seconds.
    logger.info("Graph capturing finished in %.0f secs, took %.2f GiB",
                elapsed_time, npu_graph_size / (1 << 30))

def _generate_ngram_token_ids(
    self,
    sampled_token_ids: list[list[int]],
) -> list[list[int]]:
    """Propose draft tokens per request with the n-gram drafter; requests
    that sampled nothing or don't support spec decode get []."""
    # TODO(woosuk): Optimize.
    draft_token_ids: list[list[int]] = []
    for req_idx, sampled_ids in enumerate(sampled_token_ids):
        num_sampled_ids = len(sampled_ids)
        if not num_sampled_ids:
            # Nothing sampled: skip speculative decoding for this request.
            draft_token_ids.append([])
            continue

        # Skip requests that require top-p, top-k, etc.
        req_id = self.input_batch.req_ids[req_idx]
        if req_id in self.input_batch.spec_decode_unsupported_reqs:
            draft_token_ids.append([])
            continue

        # Append the sampled tokens to token_ids_cpu before proposing.
        start_idx = self.input_batch.num_tokens_no_spec[req_idx]
        end_idx = start_idx + num_sampled_ids
        self.input_batch.token_ids_cpu[req_idx,
                                       start_idx:end_idx] = sampled_ids
        assert isinstance(self.drafter, NgramProposer)
        proposal = self.drafter.propose(
            self.input_batch.token_ids_cpu[req_idx, :end_idx])
        if proposal is None or len(proposal) == 0:
            draft_token_ids.append([])
        else:
            draft_token_ids.append(proposal.tolist())
    return draft_token_ids
+ start_idx = self.input_batch.num_tokens_no_spec[i] + end_idx = start_idx + num_sampled_ids + self.input_batch.token_ids_cpu[i, start_idx:end_idx] = sampled_ids + assert isinstance(self.drafter, NgramProposer) + drafter_output = self.drafter.propose( + self.input_batch.token_ids_cpu[i, :end_idx]) + if drafter_output is None or len(drafter_output) == 0: + draft_token_ids.append([]) + else: + draft_token_ids.append(drafter_output.tolist()) + return draft_token_ids + + def _generate_eagle3_token_ids(self, + valid_sampled_token_ids: list[list[int]], + sampling_metadata: SamplingMetadata, + scheduler_output: "SchedulerOutput", + spec_decode_metadata: SpecDecodeMetadata, + positions: torch.Tensor, + num_scheduled_tokens: int, + hidden_states: torch.Tensor, + aux_hidden_states: torch.Tensor = None): + assert isinstance(self.drafter, EagleProposer) + attn_metadata = self.get_eagle_atten_dict(scheduler_output) + next_token_ids: list[int] = [] + for i, token_ids in enumerate(valid_sampled_token_ids): + if token_ids: + # Common case. + next_token_id = token_ids[-1] + else: + # Partial prefill (rare case). + # Get the next token id from the request state. + req_id = self.input_batch.req_ids[i] + req_state = self.requests[req_id] + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + + next_token_id = req_state.get_token_id(seq_len) + next_token_ids.append(next_token_id) + next_token_ids = torch.tensor(next_token_ids, + dtype=torch.int32, + device=self.device) + eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name] + if spec_decode_metadata is None: + # input_ids can be None for multimodal models. 
+ target_token_ids = self.input_ids[:num_scheduled_tokens] + target_positions = positions[:num_scheduled_tokens] + if self.use_aux_hidden_state_outputs: + target_hidden_states = torch.cat( + [h[:num_scheduled_tokens] for h in aux_hidden_states], + dim=-1) + else: + target_hidden_states = hidden_states[:num_scheduled_tokens] + target_slot_mapping = eagle_attn_metadata.slot_mapping + cu_num_tokens = eagle_attn_metadata.query_start_loc + else: + num_draft_tokens = spec_decode_metadata.num_draft_tokens + num_rejected_tokens = [ + n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0 + for i, n in enumerate(num_draft_tokens) + ] + num_rejected_tokens = torch.tensor( + num_rejected_tokens, + dtype=torch.int32, + device=self.device, + ) + num_tokens = num_scheduled_tokens - sum(num_rejected_tokens) + cu_num_tokens, token_indices = self.drafter.prepare_inputs( + eagle_attn_metadata.query_start_loc, num_rejected_tokens, + num_tokens) + target_token_ids = self.input_ids[token_indices] + target_positions = positions[token_indices] + if self.use_aux_hidden_state_outputs: + target_hidden_states = torch.cat( + [h[token_indices] for h in aux_hidden_states], dim=-1) + else: + target_hidden_states = hidden_states[token_indices] + target_slot_mapping = eagle_attn_metadata.slot_mapping[ + token_indices] + + draft_token_ids = self.drafter.propose( + target_token_ids=target_token_ids, + target_positions=target_positions, + target_hidden_states=target_hidden_states, + target_slot_mapping=target_slot_mapping, + next_token_ids=next_token_ids, + cu_num_tokens=cu_num_tokens, + block_table=eagle_attn_metadata.block_tables, + sampling_metadata=sampling_metadata, + ) + spec_token_ids = draft_token_ids.tolist() + return spec_token_ids + + def _generate_mtp_token_ids( + self, + valid_sampled_token_ids: list[list[int]], + sampling_metadata: SamplingMetadata, + scheduler_output: "SchedulerOutput", + spec_decode_metadata: SpecDecodeMetadata, + positions: torch.Tensor, + num_scheduled_tokens: int, 
        hidden_states: torch.Tensor,
        attn_metadata: Union[AscendMetadata, AscendMLAMetadata,
                             AscendTorchairMetadata,
                             AscendMLATorchairMetadata],
    ):
        # NOTE(review): this is the continuation of _generate_mtp_token_ids —
        # the `def` line and leading parameters are in the previous chunk.
        # Proposes draft tokens with the MTP drafter and returns them as a
        # nested list (one list of draft token ids per request).
        assert isinstance(self.drafter, MtpProposer)
        next_token_ids: list[int] = []
        for i, token_ids in enumerate(valid_sampled_token_ids):
            if token_ids:
                # Common case.
                next_token_id = token_ids[-1]
            else:
                # Partial prefill (rare case).
                # Get the next token id from the request state.
                req_id = self.input_batch.req_ids[i]
                req_state = self.requests[req_id]
                seq_len = (req_state.num_computed_tokens +
                           scheduler_output.num_scheduled_tokens[req_id])
                next_token_id = req_state.get_token_id(seq_len)
            next_token_ids.append(next_token_id)
        next_token_ids = torch.tensor(next_token_ids,
                                      dtype=torch.int32,
                                      device=self.device)
        accepted_token_indices = None
        if spec_decode_metadata is None:
            # input_ids can be None for multimodal models.
            target_token_ids = self.input_ids[:num_scheduled_tokens]
            target_positions = positions[:num_scheduled_tokens]
            target_hidden_states = hidden_states[:num_scheduled_tokens]
            target_slot_mapping = attn_metadata.slot_mapping
            cu_num_tokens = attn_metadata.query_start_loc
        else:
            # TODO(woosuk): Refactor this.
            num_draft_tokens = spec_decode_metadata.num_draft_tokens
            # A request with n > 0 drafts rejects (n + 1 - accepted) tokens.
            num_rejected_tokens = [
                n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
                for i, n in enumerate(num_draft_tokens)
            ]
            num_rejected_tokens = torch.tensor(
                num_rejected_tokens,
                dtype=torch.int32,
                device=self.device,
            )
            cu_num_tokens, accepted_token_indices, target_token_ids, \
                target_positions, target_hidden_states, target_slot_mapping = \
                self.drafter.prepare_inputs(
                    attn_metadata.query_start_loc,
                    num_rejected_tokens,
                    self.input_ids[:num_scheduled_tokens],
                    positions[:num_scheduled_tokens],
                    hidden_states[:num_scheduled_tokens],
                    attn_metadata.slot_mapping[:num_scheduled_tokens],
                    is_torchair_graph=self.
                    _build_drafter_prepare_inputs_torchair_param(),
                )

        draft_token_ids = self.drafter.propose(
            target_token_ids=target_token_ids,
            target_positions=target_positions,
            target_hidden_states=target_hidden_states,
            target_slot_mapping=target_slot_mapping,
            next_token_ids=next_token_ids,
            cu_num_tokens=cu_num_tokens,
            block_table=attn_metadata.block_tables,
            sampling_metadata=sampling_metadata,
            token_indices=accepted_token_indices)
        spec_token_ids = draft_token_ids.tolist()
        return spec_token_ids

    def _get_prompt_logprobs_dict(
        self,
        hidden_states: torch.Tensor,
        scheduler_output: "SchedulerOutput",
    ) -> dict[str, Optional[LogprobsTensors]]:
        """Compute prompt logprobs for requests that asked for them.

        Accumulates per-chunk results across prefill steps and returns a
        req_id -> LogprobsTensors mapping for completed chunks.
        """
        num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs
        if not num_prompt_logprobs_dict:
            return {}

        in_progress_dict = self.input_batch.in_progress_prompt_logprobs_cpu
        prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] = {}

        # Since prompt logprobs are a rare feature, prioritize simple,
        # maintainable loop over optimal performance.
        completed_prefill_reqs = []
        for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items():

            num_tokens = scheduler_output.num_scheduled_tokens[req_id]

            # Get metadata for this request.
            request = self.requests[req_id]
            num_prompt_tokens = len(request.prompt_token_ids)
            prompt_token_ids = torch.tensor(request.prompt_token_ids).to(
                self.device, non_blocking=True)

            # Set up target LogprobsTensors object.
            logprobs_tensors = in_progress_dict.get(req_id)
            if not logprobs_tensors:
                # Create empty logprobs CPU tensors for the entire prompt.
                # If chunked, we'll copy in slice by slice.
                logprobs_tensors = LogprobsTensors.empty_cpu(
                    num_prompt_tokens - 1, num_prompt_logprobs + 1)
                in_progress_dict[req_id] = logprobs_tensors

            # Determine number of logits to retrieve.
            start_idx = request.num_computed_tokens
            start_tok = start_idx + 1
            num_remaining_tokens = num_prompt_tokens - start_tok
            if num_tokens <= num_remaining_tokens:
                # This is a chunk, more tokens remain.
                # In the == case, there are no more prompt logprobs to produce
                # but we want to defer returning them to the next step where we
                # have new generated tokens to return.
                num_logits = num_tokens
            else:
                # This is the last chunk of prompt tokens to return.
                num_logits = num_remaining_tokens
                completed_prefill_reqs.append(req_id)
                prompt_logprobs_dict[req_id] = logprobs_tensors

            if num_logits <= 0:
                # This can happen for the final chunk if we prefilled exactly
                # (num_prompt_tokens - 1) tokens for this request in the prior
                # step. There are no more prompt logprobs to produce.
                continue

            # Get the logits corresponding to this req's prompt tokens.
            # If this is a partial request (i.e. chunked prefill),
            # then there is prompt logprob generated for each index.
            req_idx = self.input_batch.req_id_to_index[req_id]
            offset = self.query_start_loc_np[req_idx].item()
            prompt_hidden_states = hidden_states[offset:offset + num_logits]
            logits = self.model.compute_logits(prompt_hidden_states, None)

            # Get the "target" tokens for each index. For prompt at index i,
            # the token at prompt index i+1 is the "sampled" token we want
            # to gather the logprob for.
            tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits]

            # Compute prompt logprobs.
            logprobs = self.sampler.compute_logprobs(logits)
            token_ids, logprobs, ranks = self.sampler.gather_logprobs(
                logprobs, num_prompt_logprobs, tgt_token_ids)

            # Transfer NPU->CPU async.
            chunk_slice = slice(start_idx, start_idx + num_logits)
            logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
                token_ids, non_blocking=True)
            logprobs_tensors.logprobs[chunk_slice].copy_(logprobs,
                                                         non_blocking=True)
            logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
                ranks, non_blocking=True)

        # Remove requests that have completed prefill from the batch
        # num_prompt_logprobs_dict.
        for req_id in completed_prefill_reqs:
            del num_prompt_logprobs_dict[req_id]
            del in_progress_dict[req_id]

        # Must synchronize the non-blocking NPU->CPU transfers.
        if prompt_logprobs_dict:
            torch.npu.synchronize()

        return prompt_logprobs_dict

    def get_supported_pooling_tasks(self):
        """Return the pooling tasks supported by the model, or [] for
        non-pooling models."""
        model = self.get_model()
        if not is_pooling_model(model):
            return []

        return list(model.pooler.get_supported_tasks())

    def _build_drafter_prepare_inputs_torchair_param(self):
        # Base runner never uses the torchair path for drafter input prep;
        # subclasses may override.
        return False
diff --git a/vllm_ascend/worker/mtp_proposer_v1.py b/vllm_ascend/worker/mtp_proposer_v1.py
new file mode 100644
index 0000000..e8f369f
--- /dev/null
+++ b/vllm_ascend/worker/mtp_proposer_v1.py
@@ -0,0 +1,439 @@
import types

import torch
import torch.nn as nn
import torchair
import vllm.envs as envs_vllm
from torchair import patch_for_hcom
from vllm.attention.layer import Attention
from vllm.config import (VllmConfig, get_layers_from_vllm_config,
                         set_current_vllm_config)
from vllm.forward_context import get_forward_context
from vllm.model_executor.model_loader import get_model_loader
from vllm.model_executor.model_loader.utils import (
    process_weights_after_loading, set_default_torch_dtype)
from vllm.v1.sample.metadata import SamplingMetadata

from vllm_ascend.ascend_config
import get_ascend_config +from vllm_ascend.ascend_forward_context import set_ascend_forward_context +from vllm_ascend.attention.utils import AscendCommonAttentionMetadata +from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP +from vllm_ascend.torchair.models.torchair_deepseek_mtp import \ + TorchairDeepSeekMTP +from vllm_ascend.torchair.utils import (TORCHAIR_CACHE_DIR, + TorchairCommonAttentionMetadata) +from vllm_ascend.utils import ProfileExecuteDuration, lmhead_tp_enable + + +class MtpProposer: + + def __init__( + self, + vllm_config: VllmConfig, + runner, + ): + self.vllm_config = vllm_config + self.num_speculative_tokens = ( + vllm_config.speculative_config.num_speculative_tokens) + self.block_size = vllm_config.cache_config.block_size + self.hidden_size = vllm_config.model_config.get_hidden_size() + self.runner = runner + # persistent buffers for graph + self.input_ids = torch.zeros(self.runner.max_num_tokens, + dtype=torch.int32, + device=self.runner.device) + self.positions = torch.zeros(self.runner.max_num_tokens, + dtype=torch.int64, + device=self.runner.device) + self.hidden_states = torch.zeros( + (self.runner.max_num_tokens, self.hidden_size), + dtype=self.runner.dtype, + device=self.runner.device) + self.torchair_compiled_model = None # type: ignore + self.torchair_compiled_models = {} # type: ignore + self.torchair_graph_enabled = get_ascend_config( + ).torchair_graph_config.enabled + + @staticmethod + def prepare_inputs( + # [batch_size + 1] + cu_target_query_lens: torch.Tensor, + # [batch_size] + num_rejected_tokens: torch.Tensor, + token_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + slot_mapping: torch.Tensor, + is_torchair_graph: bool = False + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + torch.Tensor, torch.Tensor]: + # cu_target_query_lens: [0, a, a + b, a + b + c] + # num_rejected_tokens: [n1, n2, n3] + # num_tokens_per_req: [a - n1, b - n2, c - n3] + # cu_num_tokens: [0, a - 
        #                  n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
        # token_indices: [0, 1, ..., a - n1 - 1,
        #                 a, a + 1, ..., a + b - n2 - 1,
        #                 a + b, a + b + 1, ..., a + b + c - n3 - 1]
        # NOTE(review): continuation of the staticmethod prepare_inputs — its
        # `def` line and the start of this comment are in the previous chunk.
        # [0, a, a + b, a + b + c] -> [a, b, c]
        query_len_per_req = (cu_target_query_lens[1:] -
                             cu_target_query_lens[:-1])
        # [a, b, c] -> [a - n1, b - n2, c - n3]
        num_tokens_per_req = query_len_per_req - num_rejected_tokens
        if is_torchair_graph:
            cu_num_tokens = cu_target_query_lens
            relative_index = query_len_per_req - num_rejected_tokens - 1
            token_indices = cu_num_tokens[:-1] + relative_index
            # The seq len of each batch is padded to
            # 1 + num_speculative_tokens, thus the input is the same as the
            # main model.
            target_token_ids = token_ids
            target_positions = positions
            target_hidden_states = hidden_states
            target_slot_mapping = slot_mapping
        else:
            cu_num_tokens = torch.empty_like(cu_target_query_lens)
            torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:])
            cu_num_tokens[0] = 0

            # FIXME(woosuk): Avoid synchronization.
            num_tokens = cu_num_tokens[-1].item()
            token_indices = torch.zeros(
                num_tokens,
                dtype=torch.int32,
                device=cu_num_tokens.device,
            )

            BLOCK_SIZE = 1024
            prepare_input_kernel(
                token_indices,
                cu_target_query_lens,
                cu_num_tokens,
                block_size=BLOCK_SIZE,
            )
            target_token_ids = token_ids[token_indices]
            target_positions = positions[token_indices]
            target_hidden_states = hidden_states[token_indices]
            target_slot_mapping = slot_mapping[token_indices]
        return cu_num_tokens, token_indices, target_token_ids, \
            target_positions, target_hidden_states, target_slot_mapping

    def propose(
        self,
        # [num_tokens]
        target_token_ids: torch.Tensor,
        # [num_tokens]
        target_positions: torch.Tensor,
        # [num_tokens, hidden_size]
        target_hidden_states: torch.Tensor,
        # [num_tokens]
        target_slot_mapping: torch.Tensor,
        # [batch_size]
        next_token_ids: torch.Tensor,
        # [batch_size + 1] starting with 0
        cu_num_tokens: torch.Tensor,
        # [batch_size, max_num_blocks_per_req]
        block_table: torch.Tensor,
        sampling_metadata: SamplingMetadata,
        token_indices=None) -> torch.Tensor:
        """Run one MTP drafting step and return [batch_size, 1] draft ids."""
        num_tokens = target_token_ids.shape[0]
        batch_size = next_token_ids.shape[0]
        last_token_indices = cu_num_tokens[1:] - 1

        # Shift the input ids by one token.
        # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3]
        self.input_ids[:num_tokens - 1] = target_token_ids[1:]
        # Replace the last token with the next token.
        # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
        if token_indices is not None and self.torchair_graph_enabled:
            last_token_indices = token_indices

        self.input_ids[last_token_indices] = next_token_ids

        query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
        max_query_len = query_lens.max().item()

        # FIXME: reorder_batch() needs to be called before build()
        # because fields of attn_metadata_builder need to be updated.
+ # However, currently reorder_batch() takes input_batch and + # scheduler_output as arguments, we should probably refactor + # the method to use new data structures which are independent + # from input_batch and scheduler_output. + # self.runner.attn_metadata_builder.reorder_batch( + # input_batch=self.runner.input_batch, + # scheduler_output=self.runner.scheduler_output, + # ) + is_running_torchair = self.torchair_graph_enabled and \ + not self.runner.with_prefill + + if is_running_torchair: + num_input_tokens = self.runner.graph_pad_size + else: + num_input_tokens = num_tokens + + seq_lens = target_positions[last_token_indices] + 1 + seq_lens = seq_lens.int() + common_attn_metadata = AscendCommonAttentionMetadata( + query_start_loc=cu_num_tokens[:batch_size + 1], + query_start_loc_cpu=cu_num_tokens[:batch_size + 1].cpu(), + seq_lens_cpu=seq_lens.cpu(), + num_reqs=batch_size, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + actual_seq_lengths_q=self.runner.actual_seq_lengths_q, + block_table_tensor=self.runner.input_batch.block_table[0]. 
+ get_device_tensor(), + slot_mapping_cpu=target_slot_mapping, + positions=target_positions, + attn_mask=self.runner.attn_mask, + spec_attn_mask=self.runner.spec_attn_mask, + attn_state=self.runner.attn_state, + graph_pad_size=self.runner.graph_pad_size, + decode_token_per_req=self.runner.decode_token_per_req, + ) + attn_metadata = self.runner.attn_metadata_builder.build( + common_attn_metadata, self.runner.get_model()) + + self.positions[:num_tokens] = target_positions + self.hidden_states[:num_tokens] = target_hidden_states + + if not self.torchair_graph_enabled: + # torch mode need to update num_tokens_across_dp + # TODO: adapt enable_dbo later + (num_input_tokens, num_tokens_across_dp, with_prefill, + _) = self.runner._sync_metadata_across_dp( + num_tokens, self.runner.with_prefill, False) + attn_metadata.slot_mapping = target_slot_mapping + else: + # torchair mode can reuse self.runner.num_tokens_across_dp + num_tokens_across_dp = self.runner.num_tokens_across_dp + with_prefill = self.runner.with_prefill + + with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_input_tokens, + with_prefill=with_prefill, + num_tokens_across_dp=num_tokens_across_dp, + reserved_mc2_mask=self.runner.reserved_mc2_mask, + in_profile_run=self.runner.in_profile_run, + num_actual_tokens=num_tokens): + with ProfileExecuteDuration().capture_async('mtp_forward'): + model_kwargs = {} + model_kwargs["attn_metadata"] = attn_metadata + if self.torchair_graph_enabled: + model_kwargs["kv_caches"] = self.runner.kv_caches[-1:] + if is_running_torchair: + torchair_compiled_model = self._get_torchair_lazy_compiled_model( + num_input_tokens) + hidden_states = torchair_compiled_model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + previous_hidden_states=self. 
+ hidden_states[:num_input_tokens], + inputs_embeds=None, + intermediate_tensors=None, + spec_step_idx=0, + **model_kwargs) + else: + hidden_states = self.model( + input_ids=self.input_ids[:num_input_tokens], + positions=self.positions[:num_input_tokens], + previous_hidden_states=self. + hidden_states[:num_input_tokens], + kv_caches=self.runner.kv_caches[-1:]) + + num_indices = last_token_indices.shape[0] + if lmhead_tp_enable(): + if not self.runner.with_prefill: + max_num_reqs_across_dp = num_input_tokens + else: + max_num_reqs_across_dp = self.vllm_config.scheduler_config.max_num_seqs + last_token_indices = nn.functional.pad( + last_token_indices, (0, max_num_reqs_across_dp - num_indices)) + + sample_hidden_states = hidden_states[last_token_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + if lmhead_tp_enable() and num_indices < logits.shape[0]: + logits = logits[:num_indices] + draft_token_ids = logits.argmax(dim=-1) + + # [batch_size, 1] + return draft_token_ids.view(-1, 1) + + def load_model(self) -> None: + loader = get_model_loader(self.vllm_config.load_config) + + target_attn_layer_names = set( + get_layers_from_vllm_config(self.vllm_config, Attention).keys()) + draft_model_config = \ + self.vllm_config.speculative_config.draft_model_config + target_device = self.vllm_config.device_config.device + + with set_default_torch_dtype( + draft_model_config.dtype), set_current_vllm_config( + self.vllm_config): + if self.torchair_graph_enabled: + self.model = TorchairDeepSeekMTP( + vllm_config=self.vllm_config).to(target_device) + else: + self.model = CustomDeepSeekMTP( + vllm_config=self.vllm_config).to(target_device) + + draft_attn_layer_names = ( + get_layers_from_vllm_config(self.vllm_config, Attention).keys() - + target_attn_layer_names) + + assert len(draft_attn_layer_names) == 1 + self.attn_layer_name = next(iter(draft_attn_layer_names)) + + self.model.load_weights( + loader.get_all_weights( + 
self.vllm_config.speculative_config.draft_model_config, + self.model)) + process_weights_after_loading(self.model, draft_model_config, + target_device) + + @torch.inference_mode() + def dummy_run(self, + num_tokens: int, + with_prefill: bool = False, + skip_attn: bool = False, + num_reqs: int = 0, + num_tokens_across_dp=None) -> None: + if not self.torchair_graph_enabled: + # TODO: adapt enable_dbo later + (num_tokens, num_tokens_across_dp, with_prefill, + _) = self.runner._sync_metadata_across_dp(num_tokens, + with_prefill, False) + is_running_torchair = self.torchair_graph_enabled and \ + not with_prefill + + if is_running_torchair: + skip_attn = False + if skip_attn: + attn_metadata = None + else: + common_attn_metadata = TorchairCommonAttentionMetadata( + num_reqs=num_reqs, + num_actual_tokens=1, + actual_seq_lengths_q=self.runner.actual_seq_lengths_q, + attn_mask=self.runner.attn_mask, + spec_attn_mask=self.runner.spec_attn_mask, + decode_token_per_req=self.runner.decode_token_per_req, + ) + attn_metadata = self.runner.attn_metadata_builder.build_torchair_graph_dummy( + common_attn_metadata) + + input_ids = self.input_ids[:num_tokens] + positions = self.positions[:num_tokens] + previous_hidden_states = self.hidden_states[:num_tokens] + with set_ascend_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + with_prefill=with_prefill, + num_tokens_across_dp=num_tokens_across_dp, + reserved_mc2_mask=self.runner.reserved_mc2_mask, + in_profile_run=self.runner.in_profile_run, + num_actual_tokens=0): + if is_running_torchair: + assert attn_metadata is not None + torch._dynamo.mark_static(input_ids) + torch._dynamo.mark_static(positions) + torch._dynamo.mark_static(previous_hidden_states) + torch._dynamo.mark_static(attn_metadata.decode.block_table) + torch._dynamo.mark_static(attn_metadata.decode.input_positions) + if hasattr(attn_metadata.decode, "sin"): + torch._dynamo.mark_static(attn_metadata.decode.sin) + 
torch._dynamo.mark_static(attn_metadata.decode.cos) + torch._dynamo.mark_static(get_forward_context().mc2_mask) + torch._dynamo.mark_static(attn_metadata.slot_mapping) + torch._dynamo.mark_static(attn_metadata.decode.attn_mask) + torchair_compiled_model = self._get_torchair_lazy_compiled_model( + num_tokens) + torchair_compiled_model( + input_ids=input_ids, + positions=positions, + previous_hidden_states=previous_hidden_states, + inputs_embeds=None, + intermediate_tensors=None, + attn_metadata=attn_metadata, + kv_caches=self.runner.kv_caches[-1:], + spec_step_idx=0) + else: + self.model(input_ids=input_ids, + positions=positions, + previous_hidden_states=previous_hidden_states) + + def _get_torchair_lazy_compiled_model(self, batch_size: int): + if batch_size < 0 or batch_size > self.runner.torchair_graph_batch_sizes[ + -1]: + raise ValueError( + f"Bad graph batch size:{batch_size}! max_graph_batch_sizes:{self.runner.torchair_graph_batch_sizes[-1]}" + ) + + compiled_model = self.torchair_compiled_models.get( + batch_size + ) if self.runner.use_cached_npu_graph else self.torchair_compiled_model + + if compiled_model: + return compiled_model + + patch_for_hcom() + config = torchair.CompilerConfig() + config.experimental_config.frozen_parameter = True + config.experimental_config.tiling_schedule_optimize = True + config.experimental_config.enable_view_optimize = \ + get_ascend_config().torchair_graph_config.enable_view_optimize + torch.npu.set_compile_mode(jit_compile=False) + if not self.runner.use_cached_npu_graph: + npu_backend = torchair.get_npu_backend(compiler_config=config) + self.torchair_compiled_model = torch.compile( + self.model, + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + backend=npu_backend) + return self.torchair_compiled_model + else: + # Generate a new forward proxy code object to prevent the invalidation of + # compilation cache caused by dynamo retracing + forward_proxy_name = 
f"{self.model.__class__.__name__}_forward_with_batch_size_{batch_size}" + forward_fn = self.model.forward + code = forward_fn.__code__ + # Mark code object with a new proxy name + modified_code = code.replace(co_name=forward_proxy_name, ) + + modified_func = types.FunctionType(modified_code, + forward_fn.__globals__, + name=forward_proxy_name, + argdefs=forward_fn.__defaults__) + + self.model.__dict__[forward_proxy_name] = modified_func.__get__( + self.model, nn.Module) + self.torchair_compiled_models[ + batch_size] = torchair.inference.cache_compile( + self.model.__dict__[forward_proxy_name], + dynamic=True, + fullgraph=envs_vllm.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, + cache_dir=TORCHAIR_CACHE_DIR, + config=config, + ge_cache=False) + return self.torchair_compiled_models[batch_size] + + +# TODO Using torch instead of triton may result in poor performance +def prepare_input_kernel(out_ptr: torch.Tensor, cu_query_lens: torch.Tensor, + cu_num_tokens: torch.Tensor, block_size: int): + device = cu_query_lens.device + dtype = out_ptr.dtype + + offsets = torch.arange(block_size, device=device, dtype=dtype) + start_pos = cu_num_tokens[:-1] + end_pos = cu_num_tokens[1:] + num_tokens = end_pos - start_pos + + global_indices = (start_pos.view(-1, 1) + offsets.view(1, -1)) + values = (cu_query_lens[:-1].view(-1, 1) + offsets.view(1, -1)) + + mask = (offsets.view(1, -1) < num_tokens.view(-1, 1)) + + global_indices_flat = global_indices[mask] + values_flat = values[mask] + out_ptr[global_indices_flat] = values_flat diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py new file mode 100644 index 0000000..cbd25a8 --- /dev/null +++ b/vllm_ascend/worker/npu_input_batch.py @@ -0,0 +1,821 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/gpu_input_batch.py +# + +from dataclasses import dataclass +from typing import Optional, cast + +import numpy as np +import torch +from typing_extensions import deprecated +from vllm.lora.request import LoRARequest +from vllm.multimodal.inputs import (MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) +from vllm.pooling_params import PoolingParams +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import swap_dict_values +from vllm.v1.outputs import LogprobsTensors +from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.sample.logits_processor import (BatchUpdateBuilder, + LogitsProcessors, + MoveDirectionality) +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.spec_decode.utils import is_spec_decode_unsupported +from vllm.v1.utils import copy_slice +from vllm.v1.worker.block_table import MultiGroupBlockTable + +from vllm_ascend.utils import vllm_version_is + + +@dataclass +class CachedRequestState: + + req_id: str + prompt_token_ids: list[int] + mm_kwargs: list[MultiModalKwargsItem] + mm_positions: list[PlaceholderRange] + # TODO: remove Optional after 0.10.1.1 + mm_hashes: Optional[list[str]] + sampling_params: Optional[SamplingParams] + pooling_params: Optional[PoolingParams] + generator: Optional[torch.Generator] + + block_ids: tuple[list[int], ...] 
+ num_computed_tokens: int + output_token_ids: list[int] + + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + + lora_request: Optional[LoRARequest] = None + + def __post_init__(self): + self.num_prompt_tokens = len(self.prompt_token_ids) + + @property + def num_tokens(self) -> int: + return self.num_prompt_tokens + len(self.output_token_ids) + + # Temporary back-compatibility for plugins that define model runner + @property + @deprecated("`mm_inputs` is superseded by `mm_kwargs` and will be " + "removed in v0.13. Please use `mm_kwargs` instead.") + def mm_inputs(self) -> list[MultiModalKwargs]: + return [MultiModalKwargs([item]) for item in self.mm_kwargs] + + def get_token_id(self, idx: int) -> int: + if idx < self.num_prompt_tokens: + return self.prompt_token_ids[idx] + else: + return self.output_token_ids[idx - self.num_prompt_tokens] + + +class InputBatch: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_batched_tokens: int, + device: torch.device, + pin_memory: bool, + vocab_size: int, + block_sizes: list[int], # The block_size of each kv cache group + logitsprocs: Optional[LogitsProcessors] = None, + is_spec_decode: bool = False, + is_pooling_model: bool = False, + ): + self.is_pooling_model = is_pooling_model + self.is_spec_decode = is_spec_decode + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_batched_tokens = max_num_batched_tokens + self.device = device + self.pin_memory = pin_memory + self.vocab_size = vocab_size + + self._req_ids: list[Optional[str]] = [] + self.req_id_to_index: dict[str, int] = {} + + # TODO(woosuk): This buffer could be too large if max_model_len is big. + # Find a way to reduce the CPU memory usage. + # This buffer is not directly transferred to the NPU, so it does not + # need to be pinned. 
+ self.token_ids_cpu_tensor = torch.zeros( + (max_num_reqs, max_model_len), + device="cpu", + dtype=torch.int32, + pin_memory=False, + ) + self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() + self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) + self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_computed_tokens_cpu_tensor = torch.zeros( + (max_num_reqs, ), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.num_computed_tokens_cpu = \ + self.num_computed_tokens_cpu_tensor.numpy() + + # Block table. + self.block_table = MultiGroupBlockTable( + max_num_reqs=max_num_reqs, + max_model_len=max_model_len, + max_num_batched_tokens=max_num_batched_tokens, + pin_memory=pin_memory, + device=device, + block_sizes=block_sizes, + ) + + # Sampling-related. + self.temperature = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.temperature_cpu = self.temperature_cpu_tensor.numpy() + self.greedy_reqs: set[str] = set() + self.random_reqs: set[str] = set() + + self.top_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.top_p_cpu = self.top_p_cpu_tensor.numpy() + self.top_p_reqs: set[str] = set() + + self.top_k = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device=device) + self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.top_k_cpu = self.top_k_cpu_tensor.numpy() + self.top_k_reqs: set[str] = set() + + # IDs of requests which do not support spec decoding + self.spec_decode_unsupported_reqs: set[str] = set() + + # Frequency penalty related data structures + self.frequency_penalties = 
torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.frequency_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.frequency_penalties_cpu = \ + self.frequency_penalties_cpu_tensor.numpy() + self.frequency_penalties_reqs: set[str] = set() + + # Presence penalty related data structures + self.presence_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.presence_penalties_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.presence_penalties_cpu = self.presence_penalties_cpu_tensor.numpy( + ) + self.presence_penalties_reqs: set[str] = set() + + # Repetition penalty related data structures + self.repetition_penalties = torch.empty((max_num_reqs, ), + dtype=torch.float, + device=device) + self.repetition_penalties_cpu_tensor = torch.empty( + (max_num_reqs, ), + dtype=torch.float, + device="cpu", + pin_memory=pin_memory) + self.repetition_penalties_cpu = \ + self.repetition_penalties_cpu_tensor.numpy() + self.repetition_penalties_reqs: set[str] = set() + + # lora related + self.request_lora_mapping = np.zeros((self.max_num_reqs, ), + dtype=np.int32) + self.lora_id_to_request_ids: dict[int, set[str]] = {} + self.lora_id_to_lora_request: dict[int, LoRARequest] = {} + + # req_index -> generator + # NOTE(woosuk): The indices of the requests that do not have their own + # generator should not be included in the dictionary. + self.generators: dict[int, torch.Generator] = {} + + self.num_logprobs: dict[str, int] = {} + # NOTE(rob): num_prompt_logprobs only includes reqs + # that are currently in the prefill phase. + self.num_prompt_logprobs: dict[str, int] = {} + + # To accumulate prompt logprobs tensor chunks across prefill steps. 
+        self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
+
+        # Internal representation of per-step batch state changes, used for
+        # reordering persistent batch and generating logitsprocs batch state
+        # updates. Should reset each step.
+        self.batch_update_builder = BatchUpdateBuilder()
+
+        # TODO convert this to LogitsProcessor
+        self.has_allowed_token_ids: set[str] = set()
+        # NOTE(lufang): In the mask tensor, if the corresponding token is allowed,
+        # the value is False. Since we use masked_fill_ to set -inf.
+        self.allowed_token_ids_mask: Optional[torch.Tensor] = None
+        self.allowed_token_ids_mask_cpu_tensor: Optional[torch.Tensor] = None
+
+        # req_index -> bad_words_token_ids
+        self.bad_words_token_ids: dict[int, list[list[int]]] = {}
+
+        self.logits_processing_needs_token_ids = np.zeros(max_num_reqs,
+                                                          dtype=bool)
+
+        self.req_output_token_ids: list[Optional[list[int]]] = []
+
+        # Store provided logitsprocs. If none are provided, initialize empty
+        # data structure
+        self.logitsprocs = logitsprocs or LogitsProcessors()
+
+        # This is updated each time the batch constituents change.
+        self.sampling_metadata = self._make_sampling_metadata()
+
+        self.pooling_params: dict[str, PoolingParams] = {}
+
+    @property
+    def req_ids(self) -> list[str]:
+        # None elements should only be present transiently
+        # while performing state updates to the batch.
+        return cast(list[str], self._req_ids)
+
+    def _register_add_request(self, request: "CachedRequestState") -> int:
+        """Track add-request operations for logits processors.
+        Not applicable to pooling models.
+        """
+
+        # Detailed added request metadata is only required for non-pooling
+        # models, to support logitsprocs
+        assert request.sampling_params
+
+        # Fill the next empty index if there is one.
+        if (new_req_index := self.batch_update_builder.pop_removed()) is None:
+            # Append to end otherwise.
+ new_req_index = self.num_reqs + + assert new_req_index < self.max_num_reqs + self.batch_update_builder.added.append( + (new_req_index, request.sampling_params, request.prompt_token_ids, + request.output_token_ids)) + return new_req_index + + def add_request( + self, + request: "CachedRequestState", + ) -> int: + if not self.is_pooling_model: + # New request index bookkeeping for autoregressive models. + req_index = self._register_add_request(request) + else: + req_index = self.num_reqs + + req_id = request.req_id + if req_index == len(self._req_ids): + self._req_ids.append(req_id) + self.req_output_token_ids.append(request.output_token_ids) + else: + self._req_ids[req_index] = req_id + self.req_output_token_ids[req_index] = request.output_token_ids + + self.req_id_to_index[req_id] = req_index + + # Copy the prompt token ids and output token ids. + num_prompt_tokens = len(request.prompt_token_ids) + self.num_prompt_tokens[req_index] = num_prompt_tokens + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + start_idx = num_prompt_tokens + end_idx = start_idx + len(request.output_token_ids) + self.token_ids_cpu[req_index, + start_idx:end_idx] = request.output_token_ids + # Number of token ids in token_ids_cpu. + # NOTE(woosuk): This may include spec decode tokens. + self.num_tokens[req_index] = request.num_tokens + # Number of tokens without spec decode tokens. + self.num_tokens_no_spec[req_index] = request.num_tokens + + self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens + self.block_table.add_row(request.block_ids, req_index) + + if sampling_params := request.sampling_params: + if (self.is_spec_decode + and is_spec_decode_unsupported(sampling_params)): + self.spec_decode_unsupported_reqs.add(req_id) + if sampling_params.sampling_type == SamplingType.GREEDY: + # Avoid later division by zero. 
+ self.temperature_cpu[req_index] = -1.0 + self.greedy_reqs.add(req_id) + else: + self.temperature_cpu[req_index] = sampling_params.temperature + self.random_reqs.add(req_id) + + self.top_p_cpu[req_index] = sampling_params.top_p + if sampling_params.top_p < 1: + self.top_p_reqs.add(req_id) + top_k = sampling_params.top_k + if 0 < top_k < self.vocab_size: + self.top_k_reqs.add(req_id) + else: + top_k = self.vocab_size + self.top_k_cpu[req_index] = top_k + self.frequency_penalties_cpu[ + req_index] = sampling_params.frequency_penalty + if sampling_params.frequency_penalty != 0.0: + self.frequency_penalties_reqs.add(req_id) + self.presence_penalties_cpu[ + req_index] = sampling_params.presence_penalty + if sampling_params.presence_penalty != 0.0: + self.presence_penalties_reqs.add(req_id) + self.repetition_penalties_cpu[ + req_index] = sampling_params.repetition_penalty + if sampling_params.repetition_penalty != 1.0: + self.repetition_penalties_reqs.add(req_id) + + # NOTE(woosuk): self.generators should not include the requests that + # do not have their own generator. + if request.generator is not None: + self.generators[req_index] = request.generator + + if sampling_params.logprobs is not None: + self.num_logprobs[req_id] = (self.vocab_size + if sampling_params.logprobs == -1 + else sampling_params.logprobs) + if sampling_params.prompt_logprobs is not None: + self.num_prompt_logprobs[ + req_id] = sampling_params.prompt_logprobs + + if sampling_params.allowed_token_ids: + self.has_allowed_token_ids.add(req_id) + if self.allowed_token_ids_mask_cpu_tensor is None: + # Lazy allocation for this tensor, which can be large. + # False means we don't fill with -inf. 
+ self.allowed_token_ids_mask = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device=self.device) + self.allowed_token_ids_mask_cpu_tensor = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device="cpu") + self.allowed_token_ids_mask_cpu_tensor[req_index] = True + # False means we don't fill with -inf. + self.allowed_token_ids_mask_cpu_tensor[req_index][ + sampling_params.allowed_token_ids] = False + + if sampling_params.bad_words_token_ids: + self.bad_words_token_ids[ + req_index] = sampling_params.bad_words_token_ids + elif pooling_params := request.pooling_params: + self.pooling_params[req_id] = pooling_params + self.logits_processing_needs_token_ids[req_index] = ( + pooling_params.requires_token_ids) + else: + raise NotImplementedError(request) + + # Add request lora ID + if request.lora_request: + lora_id = request.lora_request.lora_int_id + if lora_id not in self.lora_id_to_request_ids: + self.lora_id_to_request_ids[lora_id] = set() + + self.request_lora_mapping[req_index] = lora_id + self.lora_id_to_request_ids[lora_id].add(request.req_id) + self.lora_id_to_lora_request[lora_id] = request.lora_request + else: + # No LoRA + self.request_lora_mapping[req_index] = 0 + + return req_index + + def remove_request(self, req_id: str) -> Optional[int]: + """This method must always be followed by a call to condense(). + + Args: + req_id: request to remove + + Returns: + Removed request index, or `None` if `req_id` not recognized + """ + + req_index = self.req_id_to_index.pop(req_id, None) + if req_index is None: + return None + if not self.is_pooling_model: + # Autoregressive models require bookkeeping of removed requests to + # support logitsprocs. 
+ self.batch_update_builder.removed_append(req_index) + self._req_ids[req_index] = None + self.req_output_token_ids[req_index] = None + + self.greedy_reqs.discard(req_id) + self.random_reqs.discard(req_id) + self.top_p_reqs.discard(req_id) + self.top_k_reqs.discard(req_id) + self.spec_decode_unsupported_reqs.discard(req_id) + self.frequency_penalties_reqs.discard(req_id) + self.presence_penalties_reqs.discard(req_id) + self.repetition_penalties_reqs.discard(req_id) + self.generators.pop(req_index, None) + self.num_logprobs.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) + self.in_progress_prompt_logprobs_cpu.pop(req_id, None) + + # LoRA + lora_id = self.request_lora_mapping[req_index] + if lora_id != 0: + self.lora_id_to_request_ids[lora_id].discard(req_id) + if len(self.lora_id_to_request_ids[lora_id]) == 0: + self.lora_id_to_request_ids.pop(lora_id) + self.lora_id_to_lora_request.pop(lora_id) + self.request_lora_mapping[req_index] = 0 + + self.has_allowed_token_ids.discard(req_id) + if self.allowed_token_ids_mask_cpu_tensor is not None: + # False means we don't fill with -inf. 
+ self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) + self.bad_words_token_ids.pop(req_index, None) + self.pooling_params.pop(req_id, None) + return req_index + + def swap_states(self, i1: int, i2: int) -> None: + # For autoregressive models, track detailed request reordering info + # to support logitsprocs + self.batch_update_builder.moved.append( + (i1, i2, MoveDirectionality.SWAP)) + old_id_i1 = self._req_ids[i1] + old_id_i2 = self._req_ids[i2] + self._req_ids[i1], self._req_ids[i2] =\ + self._req_ids[i2], self._req_ids[i1] # noqa + self.req_output_token_ids[i1], self.req_output_token_ids[i2] =\ + self.req_output_token_ids[i2], self.req_output_token_ids[i1] + assert old_id_i1 is not None and old_id_i2 is not None + self.req_id_to_index[old_id_i1], self.req_id_to_index[old_id_i2] =\ + self.req_id_to_index[old_id_i2], self.req_id_to_index[old_id_i1] + self.num_tokens[i1], self.num_tokens[i2] =\ + self.num_tokens[i2], self.num_tokens[i1] + self.num_tokens_no_spec[i1], self.num_tokens_no_spec[i2] =\ + self.num_tokens_no_spec[i2], self.num_tokens_no_spec[i1] + self.num_prompt_tokens[i1], self.num_prompt_tokens[i2] =\ + self.num_prompt_tokens[i2], self.num_prompt_tokens[i1] + self.num_computed_tokens_cpu[i1], self.num_computed_tokens_cpu[i2] =\ + self.num_computed_tokens_cpu[i2], self.num_computed_tokens_cpu[i1] + self.temperature_cpu[i1], self.temperature_cpu[i2] =\ + self.temperature_cpu[i2], self.temperature_cpu[i1] + self.top_p_cpu[i1], self.top_p_cpu[i2] =\ + self.top_p_cpu[i2], self.top_p_cpu[i1] + self.top_k_cpu[i1], self.top_k_cpu[i2] =\ + self.top_k_cpu[i2], self.top_k_cpu[i1] + self.frequency_penalties_cpu[i1], self.frequency_penalties_cpu[i2] =\ + self.frequency_penalties_cpu[i2], self.frequency_penalties_cpu[i1] + self.presence_penalties_cpu[i1], self.presence_penalties_cpu[i2] =\ + self.presence_penalties_cpu[i2], self.presence_penalties_cpu[i1] + self.repetition_penalties_cpu[i1], self.repetition_penalties_cpu[i2] =\ + 
self.repetition_penalties_cpu[i2], self.repetition_penalties_cpu[i1]
+
+        # NOTE: the following is unsafe
+        # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
+        #     self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
+        # instead, we need to temporarily copy the data for one of the indices
+        # TODO(lucas): optimize this by only copying valid indices
+        tmp = self.token_ids_cpu[i1, ...].copy()
+        self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
+        self.token_ids_cpu[i2, ...] = tmp
+
+        swap_dict_values(self.generators, i1, i2)
+        swap_dict_values(self.bad_words_token_ids, i1, i2)
+
+        self.request_lora_mapping[i1], self.request_lora_mapping[i2] =\
+            self.request_lora_mapping[i2], self.request_lora_mapping[i1]
+
+        if self.allowed_token_ids_mask_cpu_tensor is not None:
+            self.allowed_token_ids_mask_cpu_tensor[i1], \
+                self.allowed_token_ids_mask_cpu_tensor[i2] =\
+                self.allowed_token_ids_mask_cpu_tensor[i2], \
+                self.allowed_token_ids_mask_cpu_tensor[i1]
+        self.block_table.swap_row(i1, i2)
+
+    def condense(self) -> None:
+        """Slide non-empty requests down into lower, empty indices.
+
+        Any consecutive empty indices at the very end of the list are not
+        filled.
+
+        Args:
+          empty_req_indices: empty indices which may be filled.
+
+        Returns:
+          swaps: list of (from,to) swap tuples for moved requests
+          empty_req_indices: indices not filled by condensation
+        """
+        num_reqs = self.num_reqs
+
+        if self.is_pooling_model:
+            # Will be contiguous in pooling case, just trim the lists.
+            del self._req_ids[num_reqs:]
+            del self.req_output_token_ids[num_reqs:]
+            return
+
+        if not (empty_req_indices := self.batch_update_builder.removed):
+            # All removed requests were replaced by added requests, or else no
+            # requests were removed at all. No condense() needed
+            return
+        if num_reqs == 0:
+            # The batched states are empty.
+ self._req_ids.clear() + self.req_output_token_ids.clear() + return + + # NOTE(woosuk): This function assumes that the empty_req_indices + # is sorted in descending order. + last_req_index = num_reqs + len(empty_req_indices) - 1 + while empty_req_indices: + # Find the largest non-empty index. + while last_req_index in empty_req_indices: + last_req_index -= 1 + + # Find the smallest empty index. + empty_index = self.batch_update_builder.peek_removed() + assert empty_index is not None + if empty_index >= last_req_index: + break + + # Move active request down into empty request + # index. + self.batch_update_builder.pop_removed() + # Autoregressive models require detailed tracking of condense + # operations to support logitsprocs + self.batch_update_builder.moved.append( + (last_req_index, empty_index, + MoveDirectionality.UNIDIRECTIONAL)) + req_id = self._req_ids[last_req_index] + output_token_ids = self.req_output_token_ids[last_req_index] + assert req_id is not None + self._req_ids[empty_index] = req_id + self._req_ids[last_req_index] = None + self.req_output_token_ids[empty_index] = output_token_ids + self.req_output_token_ids[last_req_index] = None + self.req_id_to_index[req_id] = empty_index + + num_tokens = self.num_tokens[last_req_index] + self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ + last_req_index, :num_tokens] + self.num_tokens[empty_index] = num_tokens + self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ + last_req_index] + self.num_prompt_tokens[empty_index] = self.num_prompt_tokens[ + last_req_index] + self.num_computed_tokens_cpu[ + empty_index] = self.num_computed_tokens_cpu[last_req_index] + self.block_table.move_row(last_req_index, empty_index) + self.temperature_cpu[empty_index] = self.temperature_cpu[ + last_req_index] + self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] + self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] + self.frequency_penalties_cpu[ + empty_index] = 
self.frequency_penalties_cpu[last_req_index] + self.presence_penalties_cpu[ + empty_index] = self.presence_penalties_cpu[last_req_index] + self.repetition_penalties_cpu[ + empty_index] = self.repetition_penalties_cpu[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator + + self.request_lora_mapping[empty_index] = self.request_lora_mapping[ + last_req_index] + + # TODO convert these to LogitsProcessors + if self.allowed_token_ids_mask_cpu_tensor is not None: + self.allowed_token_ids_mask_cpu_tensor[ + empty_index] = self.allowed_token_ids_mask_cpu_tensor[ + last_req_index] + + bad_words_token_ids = self.bad_words_token_ids.pop( + last_req_index, None) + if bad_words_token_ids is not None: + self.bad_words_token_ids[empty_index] = bad_words_token_ids + + # Decrement last_req_index since it is now empty. + last_req_index -= 1 + + # Trim lists to the batch size. + del self._req_ids[num_reqs:] + del self.req_output_token_ids[num_reqs:] + + def refresh_metadata(self): + """Apply any batch updates to sampling metadata.""" + + if self.is_pooling_model: + # Batch changes every step for pooling models. + self.sampling_metadata = self._make_sampling_metadata() + return + + # For non-pooling models - generate and apply logitsprocs update; + # reset batch update tracking. + # Update sampling metadata if batch state is changed. 
+ batch_update = self.batch_update_builder.get_and_reset(self.num_reqs) + for logit_proc in self.logitsprocs.all: + logit_proc.update_state(batch_update) + if batch_update: + self.sampling_metadata = self._make_sampling_metadata() + + def _make_sampling_metadata(self) -> SamplingMetadata: + num_reqs = self.num_reqs + if not self.all_greedy: + temperature = copy_slice(self.temperature_cpu_tensor, + self.temperature, num_reqs) + else: + temperature = None + if not self.no_top_p: + copy_slice(self.top_p_cpu_tensor, self.top_p, num_reqs) + if not self.no_top_k: + copy_slice(self.top_k_cpu_tensor, self.top_k, num_reqs) + + if not self.no_penalties: + # Since syncing these tensors is expensive only copy them + # if necessary i.e. if there are requests which require + # penalties to be applied during sampling. + copy_slice(self.frequency_penalties_cpu_tensor, + self.frequency_penalties, num_reqs) + copy_slice(self.presence_penalties_cpu_tensor, + self.presence_penalties, num_reqs) + copy_slice(self.repetition_penalties_cpu_tensor, + self.repetition_penalties, num_reqs) + + needs_prompt_token_ids = ( + not self.no_penalties + or self.logits_processing_needs_token_ids[:num_reqs].any()) + if needs_prompt_token_ids: + # The prompt tokens are used only for applying penalties or + # step pooling during the sampling/pooling process. + # Hence copy these tensors only when there are requests which + # need penalties/step_pooler to be applied. 
+ prompt_token_ids = self._make_prompt_token_ids_tensor() + else: + prompt_token_ids = None + + allowed_token_ids_mask: Optional[torch.Tensor] = None + if not self.no_allowed_token_ids: + assert self.allowed_token_ids_mask is not None + copy_slice(self.allowed_token_ids_mask_cpu_tensor, + self.allowed_token_ids_mask, num_reqs) + allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + + return SamplingMetadata( + temperature=temperature, + all_greedy=self.all_greedy, + all_random=self.all_random, + top_p=None if self.no_top_p else self.top_p[:num_reqs], + top_k=None if self.no_top_k else self.top_k[:num_reqs], + generators=self.generators, + max_num_logprobs=self.max_num_logprobs, + prompt_token_ids=prompt_token_ids, + frequency_penalties=self.frequency_penalties[:num_reqs], + presence_penalties=self.presence_penalties[:num_reqs], + repetition_penalties=self.repetition_penalties[:num_reqs], + output_token_ids=cast(list[list[int]], self.req_output_token_ids), + no_penalties=self.no_penalties, + allowed_token_ids_mask=allowed_token_ids_mask, + bad_words_token_ids=self.bad_words_token_ids, + logitsprocs=self.logitsprocs, + ) + + @property + def pooling_metadata(self) -> PoolingMetadata: + if len(self.pooling_params) == 0: + pooling_params = [] + else: + # Note, for now this assumes that all request in the batch + # are either sampling or pooling requests + assert len(self.req_ids) == len(self.pooling_params) + pooling_params = [ + self.pooling_params[req_id] for req_id in self.req_ids + ] + if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"): + return PoolingMetadata( + prompt_lens=torch.from_numpy( + self.num_prompt_tokens[:self.num_reqs]).to(self.device), + prompt_token_ids=self.sampling_metadata.prompt_token_ids, + pooling_params=pooling_params, + ) + else: + return PoolingMetadata( + prompt_lens=torch.from_numpy( + self.num_prompt_tokens[:self.num_reqs]), + prompt_token_ids=self.sampling_metadata.prompt_token_ids, + pooling_params=pooling_params, 
+ ) + + def _make_prompt_token_ids_tensor(self) -> torch.Tensor: + max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() + prompt_token_ids_cpu_tensor = torch.empty( + (self.num_reqs, max_prompt_len), + device="cpu", + dtype=torch.int64, + pin_memory=self.pin_memory, + ) + prompt_token_ids = prompt_token_ids_cpu_tensor.numpy() + prompt_token_ids[:] = self.token_ids_cpu[:self. + num_reqs, :max_prompt_len] + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + for i in range(self.num_reqs): + prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size + return prompt_token_ids_cpu_tensor.to(device=self.device, + non_blocking=True) + + def make_lora_inputs( + self, num_scheduled_tokens: np.ndarray + ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]: + """ + Given the num_scheduled_tokens for each request in the batch, return + datastructures used to activate the current LoRAs. + Returns: + 1. prompt_lora_mapping: A tuple of size self.num_reqs where, + prompt_lora_mapping[i] is the LoRA id to use for the ith prompt. + 2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens) + where, token_lora_mapping[i] is the LoRA id to use for ith token. + 3. lora_requests: Set of relevant LoRA requests. 
+ """ + + req_lora_mapping = self.request_lora_mapping[:self.num_reqs] + prompt_lora_mapping = tuple(req_lora_mapping) + token_lora_mapping = tuple( + req_lora_mapping.repeat(num_scheduled_tokens)) + active_lora_requests: set[LoRARequest] = set( + self.lora_id_to_lora_request.values()) + + return prompt_lora_mapping, token_lora_mapping, active_lora_requests + + @property + def num_reqs(self) -> int: + return len(self.req_id_to_index) + + @property + def all_greedy(self) -> bool: + return len(self.random_reqs) == 0 + + @property + def all_random(self) -> bool: + return len(self.greedy_reqs) == 0 + + @property + def no_top_p(self) -> bool: + return len(self.top_p_reqs) == 0 + + @property + def no_top_k(self) -> bool: + return len(self.top_k_reqs) == 0 + + @property + def no_penalties(self) -> bool: + return (len(self.presence_penalties_reqs) == 0 + and len(self.frequency_penalties_reqs) == 0 + and len(self.repetition_penalties_reqs) == 0) + + @property + def max_num_logprobs(self) -> Optional[int]: + return max(self.num_logprobs.values()) if self.num_logprobs else None + + @property + def no_prompt_logprob(self) -> bool: + return not self.num_prompt_logprobs + + @property + def no_allowed_token_ids(self) -> bool: + return len(self.has_allowed_token_ids) == 0 diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py new file mode 100644 index 0000000..1062d47 --- /dev/null +++ b/vllm_ascend/worker/worker_v1.py @@ -0,0 +1,354 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/gpu_worker.py +# + +import copy +from typing import Optional + +import torch +import torch.nn as nn +import torch_npu +import vllm.envs as envs_vllm +from torch_npu.op_plugin.atb._atb_ops import _register_atb_extensions +from vllm.config import VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized, + has_kv_transfer_group) +from vllm.distributed.parallel_state import get_pp_group, get_tp_group +from vllm.logger import logger +from vllm.lora.request import LoRARequest +from vllm.sequence import IntermediateTensors +from vllm.tasks import SupportedTask +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, GiB_bytes +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec +from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput +from vllm.v1.worker.worker_base import WorkerBase + +from vllm_ascend.ascend_config import init_ascend_config +from vllm_ascend.device_allocator.camem import CaMemAllocator +from vllm_ascend.distributed.parallel_state import init_ascend_model_parallel +from vllm_ascend.platform import NPUPlatform +from vllm_ascend.utils import (init_ascend_soc_version, + register_ascend_customop, sleep_mode_enabled, + try_register_lib, vllm_version_is) +from vllm_ascend.worker.model_runner_v1 import NPUModelRunner + +if not (vllm_version_is("0.10.1.1") 
or vllm_version_is("0.10.1")): + from vllm.v1.outputs import DraftTokenIds +else: + DraftTokenIds = None + + +class NPUWorker(WorkerBase): + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False, + # Additional parameters for compatibility with vllm + **kwargs): + """Initialize the worker for Ascend.""" + # register patch for vllm + from vllm_ascend.utils import adapt_patch + adapt_patch() + # Register ops when worker init. + from vllm_ascend import ops + ops.register_dummy_fusion_op() + _register_atb_extensions() + register_ascend_customop() + # init ascend config and soc version + init_ascend_config(vllm_config) + init_ascend_soc_version() + + super().__init__(vllm_config=vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker) + + # Try to import mindie_turbo to accelerate vLLM inference. + try_register_lib( + "mindie_turbo", + "MindIE Turbo is installed. vLLM inference will be accelerated with MindIE Turbo." + ) + if self.cache_config.cache_dtype == "auto": + self.cache_dtype = self.model_config.dtype + else: + self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + self.cache_config.cache_dtype] + + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + + self.profiler = self._init_profiler() + + def sleep(self, level: int = 1) -> None: + if not sleep_mode_enabled(): + raise ValueError( + "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." 
+ ) + free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] + allocator = CaMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = NPUPlatform.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." + logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self, tags: Optional[list[str]] = None) -> None: + if not sleep_mode_enabled(): + raise ValueError( + "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." + ) + allocator = CaMemAllocator.get_instance() + allocator.wake_up(tags=tags) + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + def _init_device(self): + device = torch.device(f"npu:{self.local_rank}") + NPUPlatform.set_device(device) + NPUPlatform.empty_cache() + self.init_npu_memory = NPUPlatform.mem_get_info()[0] + # Initialize the distributed environment. + self._init_worker_distributed_environment() + # Set random seed. + NPUPlatform.seed_everything(self.model_config.seed) + return device + + def init_device(self): + device = self._init_device() + # Init ModelRunner here, so that we have access to self.device. + self.model_runner = NPUModelRunner(self.vllm_config, device) + + def determine_available_memory(self) -> int: + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + NPUPlatform.clear_npu_memory() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. 
+ _, total_npu_memory = NPUPlatform.mem_get_info() + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + free_npu_memory, _ = NPUPlatform.mem_get_info() + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + assert self.init_npu_memory > free_npu_memory, ( + "Error in memory profiling. " + f"Initial free memory {self.init_npu_memory}, current free memory" + f" {free_npu_memory}. This happens when the NPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + # Get the peak memory allocation recorded by torch + peak_memory = torch_npu.npu.memory_stats()["allocated_bytes.all.peak"] + # TODO: don`t need impl this func after empty_cache in + # Worker.determine_num_available_blocks() unified` + NPUPlatform.empty_cache() + torch_allocated_bytes = torch_npu.npu.memory_stats( + )["allocated_bytes.all.current"] + total_allocated_bytes = torch_npu.npu.mem_get_info( + )[1] - torch_npu.npu.mem_get_info()[0] + non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + if non_torch_allocations > 0: + peak_memory += non_torch_allocations + available_kv_cache_memory = int( + total_npu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + available_kv_cache_memory = int(max(available_kv_cache_memory, 0)) + logger.info( + f"Available memory: {available_kv_cache_memory}, total memory: {total_npu_memory}" + ) + return available_kv_cache_memory + + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> Optional[ModelRunnerOutput]: + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = IntermediateTensors( + get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group())) + + output = self.model_runner.execute_model(scheduler_output, + intermediate_tensors) + parallel_config = self.vllm_config.parallel_config + 
if parallel_config.distributed_executor_backend != "external_launcher" \ + and not get_pp_group().is_last_rank: + assert isinstance(output, IntermediateTensors) + get_pp_group().send_tensor_dict(output.tensors, + all_gather_group=get_tp_group()) + if not has_kv_transfer_group(): + return None + + kv_connector_output = output.kv_connector_output + finished_sending = kv_connector_output.finished_sending + finished_recving = kv_connector_output.finished_recving + + if not finished_sending and not finished_recving: + return EMPTY_MODEL_RUNNER_OUTPUT + + new_output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + new_output.kv_connector_output = kv_connector_output + return new_output + + assert isinstance(output, ModelRunnerOutput) + return output + + def load_model(self) -> None: + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CaMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() # type: ignore + with context: + self.model_runner.load_model() + + def compile_or_warm_up_model(self) -> None: + # Note: need to adapt for graph mode. + warmup_sizes = (self.vllm_config.compilation_config.compile_sizes + or []).copy() + if not self.model_config.enforce_eager: + warmup_sizes = [ + x for x in warmup_sizes if x not in + self.vllm_config.compilation_config.cudagraph_capture_sizes + ] + for size in sorted(warmup_sizes, reverse=True): + logger.info("Compile and warming up model for size %d", size) + self.model_runner._dummy_run(size) + if not self.model_config.enforce_eager: + self.model_runner.capture_model() + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. 
+ NPUPlatform.seed_everything(self.model_config.seed) + + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + + def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: + return self.model_runner.get_kv_cache_spec() + + def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: + """Allocate NPU KV cache with the specified kv_cache_config.""" + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CaMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() # type: ignore + with context: + self.model_runner.initialize_kv_cache(kv_cache_config) + + def profile(self, is_start: bool = True): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + if is_start: + self.profiler.start() + else: + self.profiler.stop() + + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_runner.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) + + def list_loras(self) -> set[int]: + return self.model_runner.list_loras() + + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + + def execute_dummy_batch(self) -> None: + self.model_runner._dummy_run(1) + + def _init_worker_distributed_environment(self) -> None: + """Initialize the distributed environment.""" + init_distributed_environment(self.parallel_config.world_size, + self.rank, self.distributed_init_method, + self.local_rank, "hccl") + ensure_model_parallel_initialized( + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size) + init_ascend_model_parallel(self.parallel_config) + ensure_kv_transfer_initialized(self.vllm_config) + + def _init_profiler(self): + # Torch profiler. 
Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs_vllm.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs_vllm.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + + experimental_config = torch_npu.profiler._ExperimentalConfig( + export_type=torch_npu.profiler.ExportType.Text, + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, + msprof_tx=False, + aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone, + l2_cache=False, + op_attr=False, + data_simplification=False, + record_op_args=False, + gc_detect_threshold=None, + ) + + return torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU, + ], + with_stack=envs_vllm.VLLM_TORCH_PROFILER_WITH_STACK, + profile_memory=envs_vllm.\ + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + with_modules=False, + experimental_config=experimental_config, + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir)) + else: + return None + + def get_supported_pooling_tasks(self): + return self.model_runner.get_supported_pooling_tasks() + + def get_supported_tasks(self) -> "tuple[SupportedTask, ...]": + return self.model_runner.get_supported_tasks() + + def take_draft_token_ids(self) -> Optional[DraftTokenIds]: + return self.model_runner.take_draft_token_ids()