diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..8d06c75 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,98 @@ +cmake_minimum_required(VERSION 3.16) +project(vllm_ascend_C) + +# include(CheckCXXcompilerFlag) +# check_cxx_compiler_flag("-std=c++17", COMPILER_SUPPORTS_CXX17) +set(CMAKE_CXX_STANDARD 17) + +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) + +# Suppress potential warnings about unused manually-specified variables +set(ignoreMe "${VLLM_PYTHON_PATH}") + +# TODO: Add 3.12 back when torch-npu support 3.12 +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11") + +find_package(pybind11 REQUIRED) + +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") +set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}") + +find_package(Torch REQUIRED) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION ${SOC_VERSION}) +message(STATUS "Detected SOC version: ${SOC_VERSION}") + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE) +endif() + +if (CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRINGS "path to install()") +endif() + +set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH}) +if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.") +endif() + +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) +file(GLOB KERNEL_FILES +${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp) + +ascendc_library(vllm_ascend_kernels SHARED + ${KERNEL_FILES} +) + +message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + +file(GLOB VLLM_ASCEND_SRC +${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp) + +include_directories( + ${pybind11_INCLUDE_DIRS} + ${PYTHON_INCLUDE_PATH} + ${TORCH_INCLUDE_DIRS} + ${TORCH_NPU_PATH}/include + ${ASCEND_HOME_PATH}/include + ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform + ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform +) + +set( + INCLUDES + ${TORCH_INCLUDE_DIRS} + ${TORCH_NPU_INCLUDE_DIRS} + ${ASCEND_HOME_PATH}/include + ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform +) + +pybind11_add_module(vllm_ascend_C ${VLLM_ASCEND_SRC}) + +target_link_directories( + vllm_ascend_C + PRIVATE + ${TORCH_NPU_PATH}/lib/ + ${ASCEND_HOME_PATH}/lib64 +) + +target_link_libraries( + vllm_ascend_C + PUBLIC + ${TORCH_LIBRARIES} + libtorch_npu.so + vllm_ascend_kernels + ascendcl + platform +) + +target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib") + +install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH}) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..acdb2f7 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,127 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and 
expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. 
This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a87fa14 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,3 @@ +# Contributing to vLLM Ascend + +You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including step-by-step guide to help you setup development environment, contribute first PR and test locally. diff --git a/DCO b/DCO new file mode 100644 index 0000000..49b8cb0 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. 
+ +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..29d6445 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p b/Dockerfile.310p new file mode 100644 index 0000000..4eb3c63 --- /dev/null +++ b/Dockerfile.310p @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler new file mode 100644 index 0000000..a9d7b34 --- /dev/null +++ b/Dockerfile.310p.openEuler @@ -0,0 +1,59 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
+# + +FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + export SOC_VERSION=ASCEND310P3 && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/Dockerfile.a3 b/Dockerfile.a3 new file mode 100644 index 0000000..8bdfb0e --- /dev/null +++ b/Dockerfile.a3 @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ + rm -rf /var/cache/apt/* && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler new file mode 100644 index 0000000..aff585b --- /dev/null +++ b/Dockerfile.a3.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler new file mode 100644 index 0000000..47a0c60 --- /dev/null +++ b/Dockerfile.openEuler @@ -0,0 +1,58 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. +# + +FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11 + +ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" +ARG COMPILE_CUSTOM_KERNELS=1 + +ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} + +RUN yum update -y && \ + yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \ + rm -rf /var/cache/yum + +RUN pip config set global.index-url ${PIP_INDEX_URL} + +WORKDIR /workspace + +COPY . /vllm-workspace/vllm-ascend/ + +# Install vLLM +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_TAG=v0.10.1.1 + +RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm +# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it. 
+RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip uninstall -y triton && \ + python3 -m pip cache purge + +# Install vllm-ascend +RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ + source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ + source /usr/local/Ascend/nnal/atb/set_env.sh && \ + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ + export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ + python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ + python3 -m pip cache purge + +# Install modelscope (for fast download) and ray (for multinode) +RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ + python3 -m pip cache purge + +CMD ["/bin/bash"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.en.md b/README.en.md new file mode 100644 index 0000000..72ed323 --- /dev/null +++ b/README.en.md @@ -0,0 +1,91 @@ +
+
+
+| About Ascend | Documentation | #sig-ascend | Users Forum | Weekly Meeting | +
+
+---
+*Latest News* 🔥
+- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
+- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
+- [2025/06] The [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend helps Ascend users improve their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
+- [2025/06] The [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded; thanks to all contributors.
+- [2025/05] We released the first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
+- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with the vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
+- [2025/02] The vLLM community officially created the [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
+- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+---
+## Overview
+
+vLLM Ascend (`vllm-ascend`) is a community-maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
+
+It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU from vLLM.
+
+With the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Experts, embedding, and multi-modal LLMs, can run seamlessly on the Ascend NPU.
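+
+As a quick illustration (assuming vLLM and vllm-ascend are already installed on an NPU host, per the Getting Started links below), the standard vLLM offline-inference API is used unchanged; the model name and sampling settings here are only examples:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = ["Hello, my name is", "The future of AI is"]
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
+
+# vllm-ascend is discovered automatically as a vLLM platform plugin,
+# so no Ascend-specific code is needed here.
+llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")
+
+for output in llm.generate(prompts, sampling_params):
+    print(output.prompt, "->", output.outputs[0].text)
+```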
+
+## Prerequisites
+
+- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (experimental)
+- OS: Linux
+- Software:
+  * Python >= 3.9, < 3.12
+  * CANN >= 8.2.rc1
+  * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
+  * vLLM (the same version as vllm-ascend)
+
+## Getting Started
+
+Please use the following recommended versions to get started quickly:
+
+| Version | Release type | Doc |
+|------------|--------------|--------------------------------------|
+|v0.10.1rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
+|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
+
+## Contributing
+See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details; it is a step-by-step guide to help you set up a development environment, build, and test.
+
+We welcome and value any contributions and collaborations:
+- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
+- Please use the [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
+
+## Branch
+
+vllm-ascend has a main branch and dev branches.
+
+- **main**: the main branch, which corresponds to the vLLM main branch and is continuously monitored for quality through Ascend CI.
+- **vX.Y.Z-dev**: development branches, created alongside selected new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3`.
+
+Below are the maintained branches:
+
+| Branch | Status | Note |
+|------------|--------------|--------------------------------------|
+| main | Maintained | CI commitment for the vLLM main branch and vLLM 0.10.x branch |
+| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
+| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3; only bug fixes are allowed and no new release tags will be published |
+| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 |
+| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
+
+Please refer to the [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
+
+## Weekly Meeting
+
+- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
+- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
+
+## License
+
+Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
diff --git a/README.md b/README.md index c00eb9e..8c78bcf 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,34 @@ # enginex-ascend-910-vllm -运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型 \ No newline at end of file +运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型 + +## 镜像 + +Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.10.0rc1 + +## 总览 + +vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。 + +此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。 + +使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。 + +## 准备 + +- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持) +- 操作系统:Linux +- 软件: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (与vllm-ascend版本一致) + +## 开始使用 + +推荐您使用以下版本快速开始使用: + +| Version | Release type | Doc | +|------------|--------------|--------------------------------------| +|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多| +|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多| \ No newline at end of file diff --git a/README.zh.md b/README.zh.md new file mode 100644 index 0000000..d7f1310 --- /dev/null +++ b/README.zh.md @@ -0,0 +1,90 @@ +
+
+
+| 关于昇腾 | 官方文档 | #sig-ascend | 用户论坛 | 社区例会 | +
+ + + +--- +*最新消息* 🔥 + +- [2025/09] 我们发布了新的正式版本 [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! 请按照[官方指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html)开始在Ascend上部署大型专家并行 (EP)。 +- [2025/08] 我们与vLLM和腾讯合作举办了[vLLM北京Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q),!请在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料。 +- [2025/06] [用户案例](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html)现已上线!展示了LLaMA-Factory/verl/TRL/GPUStack等用户案例,展示了vLLM Ascend如何帮助昇腾用户在模型微调、评估、强化学习 (RL) 以及部署等场景中提升体验。 +- [2025/06] [贡献者](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)页面现已上线!所有的贡献都值得被记录,感谢所有的贡献者。 +- [2025/05] 我们发布了首个正式版本 [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)!我们与 vLLM 社区合作发布了一篇博客文章,分享了我们的实践:[Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html)。 +- [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料. +- [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库,让vLLM可以无缝运行在Ascend NPU。 +- [2024/12] 我们正在与 vLLM 社区合作,以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162). +--- +## 总览 + +vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。 + +此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。 + +使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。 + +## 准备 + +- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持) +- 操作系统:Linux +- 软件: + * Python >= 3.9, < 3.12 + * CANN >= 8.2.rc1 + * PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724 + * vLLM (与vllm-ascend版本一致) + +## 开始使用 + +推荐您使用以下版本快速开始使用: + +| Version | Release type | Doc | +|------------|--------------|--------------------------------------| +|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多| +|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多| + +## 贡献 +请参考 [CONTRIBUTING]((https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html)) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。 + +我们欢迎并重视任何形式的贡献与合作: +- 请通过[Issue](https://github.com/vllm-project/vllm-ascend/issues)来告知我们您遇到的任何Bug。 +- 请通过[用户论坛](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support)来交流使用问题和寻求帮助。 + +## 分支策略 +vllm-ascend有主干分支和开发分支。 + +- **main**: 主干分支,与vLLM的主干分支对应,并通过昇腾CI持续进行质量看护。 +- **vX.Y.Z-dev**: 开发分支,随vLLM部分新版本发布而创建,比如`v0.7.3-dev`是vllm-asend针对vLLM `v0.7.3`版本的开发分支。 + +下面是维护中的分支: + +| 分支 | 状态 | 备注 | +|------------|------------|---------------------| +| main | Maintained | 基于vLLM main分支CI看护 | +| v0.7.1-dev | Unmaintained | 只允许文档修复 | +| v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护, 只允许Bug修复,不会再发布新版本 | +| v0.9.1-dev | Maintained | 基于vLLM v0.9.1版本CI看护 | +|rfc/feature-name| Maintained | 为协作创建的[特性分支](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) | + +请参阅[版本策略](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html)了解更多详细信息。 + +## 社区例会 + +- vLLM 
Ascend 每周社区例会: https://tinyurl.com/vllm-ascend-meeting
+- 每周三下午,15:00 - 16:00 (UTC+8, [查看您的时区](https://dateful.com/convert/gmt8?t=15))
+
+## 许可证
+Apache 许可证 2.0,如 [LICENSE](./LICENSE) 文件中所示。
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 0000000..64a55cc
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,175 @@
+# Introduction
+This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating its performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance.
+
+# Overview
+**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas 800I A2 (see [quick_start](../docs/source/quick_start.md) for the list of supported devices), with different models (more coming soon).
+- Latency tests
+  - Input length: 32 tokens.
+  - Output length: 128 tokens.
+  - Batch size: fixed (8).
+  - Models: Qwen2.5-7B-Instruct, Qwen3-8B.
+  - Evaluation metrics: end-to-end latency (mean, median, p99).
+
+- Throughput tests
+  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+  - Output length: the corresponding output length of these 200 prompts.
+  - Batch size: dynamically determined by vllm to achieve maximum throughput.
+  - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
+  - Evaluation metrics: throughput.
+- Serving tests
+  - Input length: 200 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+  - Output length: the corresponding output length of these 200 prompts.
+  - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+  - **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests arrive at once. For other QPS values, the arrival time of each query is determined by a random Poisson process (with a fixed random seed).
+  - Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
+  - Evaluation metrics: throughput, TTFT (time to first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+**Benchmarking Duration**: about 800 seconds per model.
+
+# Quick Use
+## Prerequisites
+Before running the benchmarks, ensure the following:
+
+- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices.
+
+- Install the necessary dependencies for the benchmarks:
+
+  ```shell
+  pip install -r benchmarks/requirements-bench.txt
+  ```
+
+- For performance benchmarks, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) to `dummy`. This constructs random weights for the given model instead of downloading them from the internet, which greatly reduces benchmark time.
+- If you want to run benchmark customized, feel free to add your own models and parameters in the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests), let's take `Qwen2.5-VL-7B-Instruct`as an example: + + ```shell + [ + { + "test_name": "serving_qwen2_5vl_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "trust_remote_code": "", + "max_model_len": 16384 + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "backend": "openai-chat", + "dataset_name": "hf", + "hf_split": "train", + "endpoint": "/v1/chat/completions", + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + } + ] + ``` + +this Json will be structured and parsed into server parameters and client parameters by the benchmark script. This configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters, for more parameters details, see vllm benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks). + + - **Test Overview** + - Test Name: serving_qwen2_5vl_7B_tp1 + + - Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing). + + - Server Parameters + - Model: Qwen/Qwen2.5-VL-7B-Instruct + + - Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node) + + - Swap Space: 16 GB (used to handle memory overflow by swapping to disk) + + - disable_log_stats: disables logging of performance statistics. + + - disable_log_requests: disables logging of individual requests. + + - Trust Remote Code: enabled (allows execution of model-specific custom code) + + - Max Model Length: 16,384 tokens (maximum context length supported by the model) + + - Client Parameters + + - Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server) + + - Backend: openai-chat (suggests the client uses the OpenAI-compatible chat API format) + + - Dataset Source: Hugging Face (hf) + + - Dataset Split: train + + - Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent) + + - Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face) + + - Number of Prompts: 200 (the total number of prompts used during the test) + +## Run benchmarks + +### Use benchmark script +The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run command in the vllm-ascend root directory: + +```shell +bash benchmarks/scripts/run-performance-benchmarks.sh +``` + +Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following: + +```shell +. +|-- serving_qwen2_5_7B_tp1_qps_1.json +|-- serving_qwen2_5_7B_tp1_qps_16.json +|-- serving_qwen2_5_7B_tp1_qps_4.json +|-- serving_qwen2_5_7B_tp1_qps_inf.json +|-- latency_qwen2_5_7B_tp1.json +|-- throughput_qwen2_5_7B_tp1.json +``` + +These files contain detailed benchmarking results for further analysis. 
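+
+For a quick look at the numbers without further tooling, the result files can be read directly. The following is an illustrative sketch (not part of the repository); the key names are taken from the serving results consumed by `benchmarks/scripts/convert_json_to_markdown.py` (`request_throughput`, `median_ttft_ms`, `median_itl_ms`):
+
+```python
+import json
+from pathlib import Path
+
+# Default output folder of run-performance-benchmarks.sh
+results_dir = Path("benchmarks/results")
+
+for path in sorted(results_dir.glob("serving_*.json")):
+    with open(path) as f:
+        result = json.load(f)
+    # Key names follow the serving benchmark output used by
+    # convert_json_to_markdown.py.
+    print(f"{path.stem}: "
+          f"{result['request_throughput']:.2f} req/s, "
+          f"TTFT p50 {result['median_ttft_ms']:.1f} ms, "
+          f"ITL p50 {result['median_itl_ms']:.1f} ms")
+```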
+ +### Use benchmark cli + +For more flexible and customized use, benchmark cli is also provided to run online/offline benchmarks +Similarly, let’s take `Qwen2.5-VL-7B-Instruct` benchmark as an example: +#### Online serving +1. Launch the server: + + ```shell + vllm serve Qwen2.5-VL-7B-Instruct --max-model-len 16789 + ``` + +2. Running performance tests using cli + + ```shell + vllm bench serve --model Qwen2.5-VL-7B-Instruct\ + --endpoint-type "openai-chat" --dataset-name hf \ + --hf-split train --endpoint "/v1/chat/completions" \ + --dataset-path "lmarena-ai/vision-arena-bench-v0.1" \ + --num-prompts 200 \ + --request-rate 16 + ``` + +#### Offline +- **Throughput** + + ```shell + vllm bench throughput --output-json results/throughput_qwen2_5_7B_tp1.json \ + --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 --load-format dummy \ + --dataset-path /github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 200 --backend vllm + ``` + +- **Latency** + + ```shell + vllm bench latency --output-json results/latency_qwen2_5_7B_tp1.json \ + --model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 \ + --load-format dummy --num-iters-warmup 5 --num-iters 15 + ``` diff --git a/benchmarks/ops/ben_vocabparallelembedding.py b/benchmarks/ops/ben_vocabparallelembedding.py new file mode 100644 index 0000000..b3ef7ec --- /dev/null +++ b/benchmarks/ops/ben_vocabparallelembedding.py @@ -0,0 +1,158 @@ +from typing import Tuple + +import numpy as np +import pytest +import torch +import torch_npu # noqa: F401 +import vllm # noqa: F401 + +import vllm_ascend.platform # noqa: F401 + + +def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50): + """ + Benchmark function for NPU operations + + Args: + fn: Function to benchmark + num_iterations: Number of timing iterations + num_warmup_iterations: Number of warmup iterations + + Returns: + float: Minimum elapsed time in seconds + """ + start = torch.npu.Event(enable_timing=True) + end = torch.npu.Event(enable_timing=True) + times = np.zeros(num_iterations + num_warmup_iterations) + + # Run iterations + for i in range(num_warmup_iterations + num_iterations): + with torch.no_grad(): + start.record() + fn() # Execute the function + end.record() + torch.npu.synchronize() + times[i] = start.elapsed_time(end) + + # Remove warmup iterations and convert to seconds + times = times[num_warmup_iterations:] + elapsed_time = np.amin(times) / 1000 + return elapsed_time + + +def get_masked_input_and_mask_ref( + input_: torch.Tensor, + org_vocab_start_index: int, + org_vocab_end_index: int, + num_org_vocab_padding: int, + added_vocab_start_index: int, + added_vocab_end_index: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Reference implementation for verification""" + org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) + added_vocab_mask = (input_ >= added_vocab_start_index) & ( + input_ < added_vocab_end_index + ) + added_offset = ( + added_vocab_start_index + - (org_vocab_end_index - org_vocab_start_index) + - num_org_vocab_padding + ) + valid_offset = (org_vocab_start_index * org_vocab_mask) + ( + added_offset * added_vocab_mask + ) + vocab_mask = org_vocab_mask | added_vocab_mask + masked_input = vocab_mask * (input_ - valid_offset) + return masked_input, ~vocab_mask + + +DTYPES = [torch.int32] +SHAPES = [(3, 4, 5)] +DEVICES = [f"npu:{0}"] +SEEDS = [0] + + +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) 
+@pytest.mark.parametrize("seed", SEEDS) +@torch.inference_mode() +def test_get_masked_input_and_mask( + shape: Tuple[int, ...], + dtype: torch.dtype, + device: str, + seed: int, +) -> None: + # Set random seed and device + torch.manual_seed(seed) + torch.set_default_device(device) + + # Generate random input tensor + input_tensor = torch.randint(0, 1000, shape, dtype=dtype) + + # Test parameters + test_case = { + "org_start": 100, + "org_end": 200, + "padding": 0, + "added_start": 300, + "added_end": 400, + } + + # Define reference function + def ref_fn(): + return get_masked_input_and_mask_ref( + input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"], + ) + + # Define custom function + def custom_fn(): + return torch.ops._C.get_masked_input_and_mask( + input_tensor, + test_case["org_start"], + test_case["org_end"], + test_case["padding"], + test_case["added_start"], + test_case["added_end"], + ) + + # Get results for correctness testing + ref_masked_input, ref_mask = ref_fn() + custom_masked_input, custom_mask = custom_fn() + + # Benchmark both implementations + ref_time = benchmark_npu(ref_fn) + custom_time = benchmark_npu(custom_fn) + + # Print performance results + print("\nPerformance Results:") + print(f"Reference implementation: {ref_time * 1000:.3f} ms") + print(f"Custom implementation: {custom_time * 1000:.3f} ms") + print(f"Speedup: {ref_time / custom_time:.2f}x") + + # Compare results for correctness + ref_masked_input = ref_masked_input.to(dtype) + print("\nResults comparison:") + print("custom_masked_input:", custom_masked_input) + print("ref_masked_input:", ref_masked_input) + print("custom_mask:", custom_mask) + print("ref_mask:", ref_mask) + torch.testing.assert_close( + custom_masked_input, + ref_masked_input, + rtol=1e-5, + atol=1e-5, + msg=f"Masked input mismatch for case: {test_case}", + ) + torch.testing.assert_close( + custom_mask, + ref_mask, + rtol=1e-5, + atol=1e-5, + msg=f"Mask mismatch for case: {test_case}", + ) diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt new file mode 100644 index 0000000..2290823 --- /dev/null +++ b/benchmarks/requirements-bench.txt @@ -0,0 +1,4 @@ +pandas +datasets +modelscope +tabulate \ No newline at end of file diff --git a/benchmarks/scripts/convert_json_to_markdown.py b/benchmarks/scripts/convert_json_to_markdown.py new file mode 100644 index 0000000..1120434 --- /dev/null +++ b/benchmarks/scripts/convert_json_to_markdown.py @@ -0,0 +1,188 @@ +import argparse +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +CUR_PATH = Path(__file__).parent.resolve() +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "avg_latency": "Mean latency (ms)", + "P50": "Median latency (ms)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "num_requests": "Num of reqs", + "total_num_tokens": "Total num of tokens", + "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "request_rate": "Request rate (req/s)", + "request_throughput": 
"Tput (req/s)", + "output_throughput": "Output Tput (tok/s)", + "median_ttft_ms": "TTFT (ms)", + "median_tpot_ms": "TPOT (ms)", + "median_itl_ms": "ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file) as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps( + { + "latency": latency.to_dict(), + "throughput": throughput.to_dict(), + "serving": serving.to_dict(), + } + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Process the results of the benchmark tests." + ) + parser.add_argument( + "--results_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.", + ) + parser.add_argument( + "--output_folder", + type=str, + default="../results/", + help="The folder where the benchmark results are stored.", + ) + parser.add_argument( + "--markdown_template", + type=str, + default="./perf_result_template.md", + help="The template file for the markdown report.", + ) + parser.add_argument( + "--tag", default="main", help="Tag to be used for release message." + ) + parser.add_argument( + "--commit_id", default="", help="Commit ID to be used for release message." + ) + + args = parser.parse_args() + results_folder = (CUR_PATH / args.results_folder).resolve() + output_folder = (CUR_PATH / args.output_folder).resolve() + markdown_template = (CUR_PATH / args.markdown_template).resolve() + + # collect results + for test_file in results_folder.glob("*.json"): + with open(test_file) as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} + ) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + serving_results.sort(key=lambda x: (len(x["test_name"]), x["test_name"])) + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json( + latency_results, throughput_results, serving_results + ) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list(latency_column_mapping.keys())].rename( + columns=latency_column_mapping + ) + if not serving_results.empty: + serving_results = serving_results[list(serving_column_mapping.keys())].rename( + columns=serving_column_mapping + ) + if not throughput_results.empty: + throughput_results = throughput_results[ + 
            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)
+
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )
+
+    # get markdown tables
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+
+    # document the result
+    print(output_folder)
+    with open(output_folder / "benchmark_results.md", "w") as f:
+        results = read_markdown(markdown_template)
+        results = results.format(
+            latency_tests_markdown_table=latency_md_table,
+            throughput_tests_markdown_table=throughput_md_table,
+            serving_tests_markdown_table=serving_md_table,
+            benchmarking_results_in_json_string=processed_results_json,
+        )
+        f.write(results) diff --git a/benchmarks/scripts/perf_result_template.md b/benchmarks/scripts/perf_result_template.md new file mode 100644 index 0000000..cb6a2e6 --- /dev/null +++ b/benchmarks/scripts/perf_result_template.md @@ -0,0 +1,31 @@ +## Online serving tests
+
+- Input length: randomly sample 200 prompts from the [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main) (multi-modal) datasets (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: throughput, TTFT (median time to first token), ITL (median inter-token latency), TPOT (median time per output token).
+
+{serving_tests_markdown_table}
+
+## Offline tests
+### Latency tests
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: end-to-end latency.
+
+{latency_tests_markdown_table}
+
+### Throughput tests
+
+- Input length: randomly sample 200 prompts from the [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main) (multi-modal) datasets (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
+- Evaluation metrics: throughput.
+ +{throughput_tests_markdown_table} diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh new file mode 100644 index 0000000..b604fe9 --- /dev/null +++ b/benchmarks/scripts/run-performance-benchmarks.sh @@ -0,0 +1,321 @@ +#!/bin/bash +set -e + +check_npus() { + # shellcheck disable=SC2155 + declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ') + + if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then + echo "Need at least 1 NPU to run benchmarking." + exit 1 + else + echo "found NPU conut: $npu_count" + fi + + npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}') + + echo "NPU type is: $npu_type" +} + +ensure_sharegpt_downloaded() { + local FILE="/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json" + local DIR + DIR=$(dirname "$FILE") + + if [ ! -f "$FILE" ]; then + echo "$FILE not found, downloading from hf-mirror ..." + mkdir -p "$DIR" + wget -O "$FILE" https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + if [ $? -ne 0 ]; then + echo "Download failed!" >&2 + return 1 + fi + echo "Download completed and saved to $FILE" + else + echo "$FILE already exists." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args + args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + local waited=0 + local timeout_sec=1200 + + while (( waited < timeout_sec )); do + if curl -s -X GET localhost:8000/health > /dev/null; then + return 0 + fi + echo "Waiting for vllm server to start..." + sleep 1 + ((waited++)) + done + + echo "Timeout waiting for server" + return 1 +} + +get_cur_npu_id() { + npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}' +} + +kill_npu_processes() { + ps -aux + lsof -t -i:8000 | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + + sleep 4 + rm -rf ~/.config/vllm + +} + +update_json_field() { + local json_file="$1" + local field_name="$2" + local field_value="$3" + + jq --arg value "$field_value" \ + --arg key "$field_name" \ + '.[$key] = $value' "$json_file" > "${json_file}.tmp" && \ + mv "${json_file}.tmp" "$json_file" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + latency_command="vllm bench latency \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # run the benchmark + eval "$latency_command" + # echo model_name to result file + model_name=$(echo "$latency_params" | jq -r '.model') + update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name" + kill_npu_processes + + done +} + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + throughput_command="vllm bench throughput \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + + # run the benchmark + eval "$throughput_command" + # echo model_name to result file + model_name=$(echo "$throughput_params" | jq -r '.model') + update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name" + kill_npu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the NPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." 
+ continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + bash -c "$server_command" & + server_pid=$! + + # wait until the server is alive + if wait_for_server; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="vllm bench serve \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + bash -c "$client_command" + done + + # clean up + kill -9 $server_pid + kill_npu_processes + done +} + +cleanup() { + rm -rf ./vllm_benchmarks +} + +cleanup_on_error() { + echo "An error occurred. Cleaning up results folder..." + rm -rf $RESULTS_FOLDER +} + +main() { + START_TIME=$(date +%s) + check_npus + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + (which lsof) || (apt-get update && apt-get install -y lsof) + + # get the current IP address, required by benchmark_serving.py + # shellcheck disable=SC2155 + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # turn of the reporting of the status of each request, to clean up the terminal output + export VLLM_LOG_LEVEL="WARNING" + + # set env + export VLLM_USE_MODELSCOPE=True + + # prepare for benchmarking + cd benchmarks || exit 1 + trap cleanup EXIT + + QUICK_BENCHMARK_ROOT=./ + + declare -g RESULTS_FOLDER=results + mkdir -p $RESULTS_FOLDER + + trap cleanup_on_error ERR + ensure_sharegpt_downloaded + # benchmarks + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json + + END_TIME=$(date +%s) + ELAPSED_TIME=$((END_TIME - START_TIME)) + echo "Total execution time: $ELAPSED_TIME seconds" + +} + +main "$@" diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json new file mode 100644 index 0000000..40cec4c --- /dev/null +++ b/benchmarks/tests/latency-tests.json @@ -0,0 +1,23 @@ +[ + { + "test_name": "latency_qwen3_8B_tp1", + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "max_model_len": 16384, + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json new file mode 100644 index 0000000..6398710 --- /dev/null +++ b/benchmarks/tests/serving-tests.json @@ -0,0 +1,77 @@ +[ + { + "test_name": "serving_qwen2_5vl_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "trust_remote_code": "", + 
"max_model_len": 16384 + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "endpoint_type": "openai-chat", + "dataset_name": "hf", + "hf_split": "train", + "endpoint": "/v1/chat/completions", + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_8B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen2_5_7B_tp1", + "qps_list": [ + 1, + 4, + 16, + "inf" + ], + "server_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "endpoint_type": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000..3698e69 --- /dev/null +++ b/benchmarks/tests/throughput-tests.json @@ -0,0 +1,38 @@ +[ + { + "test_name": "throughput_qwen3_8B_tp1", + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_qwen2_5vl_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-VL-7B-Instruct", + "tensor_parallel_size": 1, + "backend": "vllm-chat", + "dataset_name": "hf", + "hf_split": "train", + "max_model_len": 16384, + "dataset_path": "lmarena-ai/vision-arena-bench-v0.1", + "num_prompts": 200 + } + }, + { + "test_name": "throughput_qwen2_5_7B_tp1", + "parameters": { + "model": "Qwen/Qwen2.5-7B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] + diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 0000000..62078fd --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,133 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. 
The standard output of python is stored in `OUT` and
+# has trailing whitespace stripped. If an error is encountered when running
+# python, a fatal message `ERR_MSG` is issued.
+#
+function (run_python OUT EXPR ERR_MSG)
+  execute_process(
+    COMMAND
+    "${PYTHON_EXECUTABLE}" "-c" "${EXPR}"
+    OUTPUT_VARIABLE PYTHON_OUT
+    RESULT_VARIABLE PYTHON_ERROR_CODE
+    ERROR_VARIABLE PYTHON_STDERR
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  if(NOT PYTHON_ERROR_CODE EQUAL 0)
+    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
+  endif()
+  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
+endfunction()
+
+# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
+# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
+macro (append_cmake_prefix_path PKG EXPR)
+  run_python(_PREFIX_PATH
+    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
+  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
+endmacro()
+
+
+# This cmake function is adapted from vLLM's cmake/utils.cmake.
+# Define a target named `GPU_MOD_NAME` for a single extension. The
+# arguments are:
+#
+# DESTINATION