forked from EngineX-Ascend/enginex-ascend-910-vllm
v0.10.1rc1
This commit is contained in:
98
CMakeLists.txt
Normal file
98
CMakeLists.txt
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.16)
project(vllm_ascend_C)

# Build the C++ extension with C++17, and fail instead of silently
# downgrading if the compiler cannot provide it.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

# Suppress potential warnings about unused manually-specified variables.
set(ignoreMe "${VLLM_PYTHON_PATH}")

# TODO: Add 3.12 back when torch-npu support 3.12
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11")

find_package(pybind11 REQUIRED)

# Make torch's bundled CMake package config discoverable before
# find_package(Torch) runs (helper defined in cmake/utils.cmake).
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")

# Capture the prefix as seen at configure start; install() below uses this,
# not any later rewrite of CMAKE_INSTALL_PREFIX.
set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")

find_package(Torch REQUIRED)

set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
# SOC_VERSION is expected to be passed in with -DSOC_VERSION=... (the
# original `set(SOC_VERSION ${SOC_VERSION})` self-assignment was a no-op
# and has been removed).
message(STATUS "Detected SOC version: ${SOC_VERSION}")

# Default to an optimized build when the user did not choose one.
# Fixed: the cache entry type must be STRING — "STRINGS" is not one of the
# valid set() cache types (BOOL/FILEPATH/PATH/STRING/INTERNAL).
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Release)" FORCE)
endif()

# Redirect the default system-wide prefix into the source tree's ./out.
# FORCE is required here: CMAKE_INSTALL_PREFIX already exists in the cache
# (initialized to /usr/local), so a non-FORCE set() would never take effect.
# User-provided prefixes are untouched because of the guard.
if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local")
    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path to install()" FORCE)
endif()

# Locate the AscendC kernel cmake helpers inside the CANN toolkit; the
# directory layout differs between CANN package flavors/versions.
set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH})
if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
endif()

include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)

# CONFIGURE_DEPENDS re-runs the glob at build time so newly added kernel
# sources are picked up without a manual re-configure.
file(GLOB KERNEL_FILES CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)

# AscendC device-kernel shared library (ascendc_library comes from
# ascendc.cmake above).
ascendc_library(vllm_ascend_kernels SHARED
    ${KERNEL_FILES}
)

message(STATUS "TORCH_NPU_PATH is ${TORCH_NPU_PATH}")

file(GLOB VLLM_ASCEND_SRC CONFIGURE_DEPENDS
    ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)

# NOTE(review): INCLUDES is not consumed anywhere in this file; kept in case
# an included helper expects it — confirm and remove if unused.
set(
    INCLUDES
    ${TORCH_INCLUDE_DIRS}
    ${TORCH_NPU_INCLUDE_DIRS}
    ${ASCEND_HOME_PATH}/include
    ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
)

pybind11_add_module(vllm_ascend_C ${VLLM_ASCEND_SRC})

# Target-scoped include paths (replaces the directory-wide
# include_directories(); only vllm_ascend_C consumed them, since
# vllm_ascend_kernels was created before the directory property was set).
# Both the aarch64 and x86_64 platform dirs are listed; only the one that
# exists for the host toolkit matters.
target_include_directories(vllm_ascend_C PRIVATE
    ${pybind11_INCLUDE_DIRS}
    ${PYTHON_INCLUDE_PATH}
    ${TORCH_INCLUDE_DIRS}
    ${TORCH_NPU_PATH}/include
    ${ASCEND_HOME_PATH}/include
    ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
    ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
)

target_link_directories(vllm_ascend_C PRIVATE
    ${TORCH_NPU_PATH}/lib/
    ${ASCEND_HOME_PATH}/lib64
)

# libtorch_npu.so is linked by file name (torch-npu ships no imported
# CMake target); it is resolved via the link directories above.
target_link_libraries(vllm_ascend_C PUBLIC
    ${TORCH_LIBRARIES}
    libtorch_npu.so
    vllm_ascend_kernels
    ascendcl
    platform
)

# Let the extension find its co-installed shared libraries at runtime.
target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib")

install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH})
|
||||||
127
CODE_OF_CONDUCT.md
Normal file
127
CODE_OF_CONDUCT.md
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
|
||||||
|
# vLLM Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socioeconomic status,
|
||||||
|
nationality, personal appearance, race, caste, color, religion, or sexual
|
||||||
|
identity and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the overall
|
||||||
|
community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or advances of
|
||||||
|
any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email address,
|
||||||
|
without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official email address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline/IRL event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement in the #code-of-conduct
|
||||||
|
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series of
|
||||||
|
actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or permanent
|
||||||
|
ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within the
|
||||||
|
community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/),
|
||||||
|
version 2.1, available at
|
||||||
|
[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html).
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by
|
||||||
|
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the
|
||||||
|
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
|
||||||
|
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).
|
||||||
3
CONTRIBUTING.md
Normal file
3
CONTRIBUTING.md
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# Contributing to vLLM Ascend
|
||||||
|
|
||||||
|
You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including step-by-step guide to help you setup development environment, contribute first PR and test locally.
|
||||||
34
DCO
Normal file
34
DCO
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
Developer Certificate of Origin
|
||||||
|
Version 1.1
|
||||||
|
|
||||||
|
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
|
||||||
|
|
||||||
|
Everyone is permitted to copy and distribute verbatim copies of this
|
||||||
|
license document, but changing it is not allowed.
|
||||||
|
|
||||||
|
|
||||||
|
Developer's Certificate of Origin 1.1
|
||||||
|
|
||||||
|
By making a contribution to this project, I certify that:
|
||||||
|
|
||||||
|
(a) The contribution was created in whole or in part by me and I
|
||||||
|
have the right to submit it under the open source license
|
||||||
|
indicated in the file; or
|
||||||
|
|
||||||
|
(b) The contribution is based upon previous work that, to the best
|
||||||
|
of my knowledge, is covered under an appropriate open source
|
||||||
|
license and I have the right under that license to submit that
|
||||||
|
work with modifications, whether created in whole or in part
|
||||||
|
by me, under the same open source license (unless I am
|
||||||
|
permitted to submit under a different license), as indicated
|
||||||
|
in the file; or
|
||||||
|
|
||||||
|
(c) The contribution was provided directly to me by some other
|
||||||
|
person who certified (a), (b) or (c) and I have not modified
|
||||||
|
it.
|
||||||
|
|
||||||
|
(d) I understand and agree that this project and the contribution
|
||||||
|
are public and that a record of the contribution (including all
|
||||||
|
personal information I submit with it, including my sign-off) is
|
||||||
|
maintained indefinitely and may be redistributed consistent with
|
||||||
|
this project or the open source license(s) involved.
|
||||||
60
Dockerfile
Normal file
60
Dockerfile
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
# NOTE(review): `source` is a bash builtin; this relies on the base image
# using bash as the RUN shell — verify, or use `.` for POSIX sh.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
61
Dockerfile.310p
Normal file
61
Dockerfile.310p
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
# SOC_VERSION pins the custom-kernel build to Ascend 310P3 silicon.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    export SOC_VERSION=ASCEND310P3 && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
59
Dockerfile.310p.openEuler
Normal file
59
Dockerfile.310p.openEuler
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN yum update -y && \
    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
    rm -rf /var/cache/yum

RUN pip config set global.index-url ${PIP_INDEX_URL}

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# CPLUS_INCLUDE_PATH points at openEuler's gcc-12 C++ headers; SOC_VERSION
# pins the custom-kernel build to Ascend 310P3 silicon.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
    export SOC_VERSION=ASCEND310P3 && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
60
Dockerfile.a3
Normal file
60
Dockerfile.a3
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

RUN pip config set global.index-url ${PIP_INDEX_URL}

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
58
Dockerfile.a3.openEuler
Normal file
58
Dockerfile.a3.openEuler
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN yum update -y && \
    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
    rm -rf /var/cache/yum

RUN pip config set global.index-url ${PIP_INDEX_URL}

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# CPLUS_INCLUDE_PATH points at openEuler's gcc-12 C++ headers.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
58
Dockerfile.openEuler
Normal file
58
Dockerfile.openEuler
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#

FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11

ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1

ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}

RUN yum update -y && \
    yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
    rm -rf /var/cache/yum

RUN pip config set global.index-url ${PIP_INDEX_URL}

WORKDIR /workspace

COPY . /vllm-workspace/vllm-ascend/

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1

RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
# Fixed: use the documented pip option `--extra-index-url` (the abbreviated
# `--extra-index` only worked through optparse prefix matching).
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton && \
    python3 -m pip cache purge

# Install vllm-ascend
# CPLUS_INCLUDE_PATH points at openEuler's gcc-12 C++ headers.
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index-url https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip cache purge

# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
    python3 -m pip cache purge

CMD ["/bin/bash"]
|
||||||
201
LICENSE
Normal file
201
LICENSE
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
91
README.en.md
Normal file
91
README.en.md
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
<p align="center">
|
||||||
|
<picture>
|
||||||
|
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
|
||||||
|
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
|
||||||
|
</picture>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3 align="center">
|
||||||
|
vLLM Ascend Plugin
|
||||||
|
</h3>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
---
|
||||||
|
*Latest News* 🔥
|
||||||
|
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend.
|
||||||
|
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
|
||||||
|
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
|
||||||
|
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
|
||||||
|
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
|
||||||
|
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
|
||||||
|
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
|
||||||
|
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
|
||||||
|
---
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
|
||||||
|
|
||||||
|
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
|
||||||
|
|
||||||
|
By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
|
||||||
|
- OS: Linux
|
||||||
|
- Software:
|
||||||
|
* Python >= 3.9, < 3.12
|
||||||
|
* CANN >= 8.2.rc1
|
||||||
|
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||||
|
* vLLM (the same version as vllm-ascend)
|
||||||
|
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
Please use the following recommended versions to get started quickly:
|
||||||
|
|
||||||
|
| Version | Release type | Doc |
|
||||||
|
|------------|--------------|--------------------------------------|
|
||||||
|
|v0.10.1rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|
||||||
|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
|
||||||
|
|
||||||
|
We welcome and value any contributions and collaborations:
|
||||||
|
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
|
||||||
|
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
|
||||||
|
|
||||||
|
## Branch
|
||||||
|
|
||||||
|
vllm-ascend has main branch and dev branch.
|
||||||
|
|
||||||
|
- **main**: main branch, corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
|
||||||
|
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
|
||||||
|
|
||||||
|
Below are the maintained branches:
|
||||||
|
|
||||||
|
| Branch | Status | Note |
|
||||||
|
|------------|--------------|--------------------------------------|
|
||||||
|
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.10.x branch |
|
||||||
|
| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
|
||||||
|
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
|
||||||
|
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
|
||||||
|
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
|
||||||
|
|
||||||
|
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
|
||||||
|
|
||||||
|
## Weekly Meeting
|
||||||
|
|
||||||
|
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
|
||||||
|
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
|
||||||
33
README.md
33
README.md
@@ -1,3 +1,34 @@
|
|||||||
# enginex-ascend-910-vllm
|
# enginex-ascend-910-vllm
|
||||||
|
|
||||||
运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型
|
运行于【昇腾-910】系列算力卡的【文本生成】引擎,基于 vLLM 引擎进行架构特别适配优化,支持 Qwen、DeepSeek、Llama 等最新开源模型
|
||||||
|
|
||||||
|
## 镜像
|
||||||
|
|
||||||
|
Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.10.0rc1
|
||||||
|
|
||||||
|
## 总览
|
||||||
|
|
||||||
|
vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。
|
||||||
|
|
||||||
|
此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。
|
||||||
|
|
||||||
|
使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。
|
||||||
|
|
||||||
|
## 准备
|
||||||
|
|
||||||
|
- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持)
|
||||||
|
- 操作系统:Linux
|
||||||
|
- 软件:
|
||||||
|
* Python >= 3.9, < 3.12
|
||||||
|
* CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
|
||||||
|
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||||
|
* vLLM (与vllm-ascend版本一致)
|
||||||
|
|
||||||
|
## 开始使用
|
||||||
|
|
||||||
|
推荐您使用以下版本快速开始使用:
|
||||||
|
|
||||||
|
| Version | Release type | Doc |
|
||||||
|
|------------|--------------|--------------------------------------|
|
||||||
|
|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||||
|
|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
|
||||||
90
README.zh.md
Normal file
90
README.zh.md
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
<p align="center">
|
||||||
|
<picture>
|
||||||
|
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
|
||||||
|
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
|
||||||
|
</picture>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3 align="center">
|
||||||
|
vLLM Ascend Plugin
|
||||||
|
</h3>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
| <a href="https://www.hiascend.com/en/"><b>关于昇腾</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>官方文档</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>用户论坛</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>社区例会</b></a> |
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<a href="README.md"><b>English</b></a> | <a><b>中文</b></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
---
|
||||||
|
*最新消息* 🔥
|
||||||
|
|
||||||
|
- [2025/09] 我们发布了新的正式版本 [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! 请按照[官方指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html)开始在Ascend上部署大型专家并行 (EP)。
|
||||||
|
- [2025/08] 我们与vLLM和腾讯合作举办了[vLLM北京Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q)!请在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料。
|
||||||
|
- [2025/06] [用户案例](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html)现已上线!展示了LLaMA-Factory/verl/TRL/GPUStack等用户案例,展示了vLLM Ascend如何帮助昇腾用户在模型微调、评估、强化学习 (RL) 以及部署等场景中提升体验。
|
||||||
|
- [2025/06] [贡献者](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html)页面现已上线!所有的贡献都值得被记录,感谢所有的贡献者。
|
||||||
|
- [2025/05] 我们发布了首个正式版本 [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)!我们与 vLLM 社区合作发布了一篇博客文章,分享了我们的实践:[Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html)。
|
||||||
|
- [2025/03] 我们和vLLM团队举办了[vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg)! 你可以在[这里](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF)找到演讲材料.
|
||||||
|
- [2025/02] vLLM社区正式创建了[vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)仓库,让vLLM可以无缝运行在Ascend NPU。
|
||||||
|
- [2024/12] 我们正在与 vLLM 社区合作,以支持 [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
|
||||||
|
---
|
||||||
|
## 总览
|
||||||
|
|
||||||
|
vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NPU无缝运行的后端插件。
|
||||||
|
|
||||||
|
此插件是 vLLM 社区中支持昇腾后端的推荐方式。它遵循[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162)所述原则:通过解耦的方式提供了vLLM对Ascend NPU的支持。
|
||||||
|
|
||||||
|
使用 vLLM 昇腾插件,可以让类Transformer、混合专家(MOE)、嵌入、多模态等流行的大语言模型在 Ascend NPU 上无缝运行。
|
||||||
|
|
||||||
|
## 准备
|
||||||
|
|
||||||
|
- 硬件:Atlas 800I A2 Inference系列、Atlas A2 Training系列、Atlas 800I A3 Inference系列、Atlas A3 Training系列、Atlas 300I Duo(实验性支持)
|
||||||
|
- 操作系统:Linux
|
||||||
|
- 软件:
|
||||||
|
* Python >= 3.9, < 3.12
|
||||||
|
* CANN >= 8.2.rc1
|
||||||
|
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||||
|
* vLLM (与vllm-ascend版本一致)
|
||||||
|
|
||||||
|
## 开始使用
|
||||||
|
|
||||||
|
推荐您使用以下版本快速开始使用:
|
||||||
|
|
||||||
|
| Version | Release type | Doc |
|
||||||
|
|------------|--------------|--------------------------------------|
|
||||||
|
|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||||
|
|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
|
||||||
|
|
||||||
|
## 贡献
|
||||||
|
请参考 [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) 文档了解更多关于开发环境搭建、功能测试以及 PR 提交规范的信息。
|
||||||
|
|
||||||
|
我们欢迎并重视任何形式的贡献与合作:
|
||||||
|
- 请通过[Issue](https://github.com/vllm-project/vllm-ascend/issues)来告知我们您遇到的任何Bug。
|
||||||
|
- 请通过[用户论坛](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support)来交流使用问题和寻求帮助。
|
||||||
|
|
||||||
|
## 分支策略
|
||||||
|
vllm-ascend有主干分支和开发分支。
|
||||||
|
|
||||||
|
- **main**: 主干分支,与vLLM的主干分支对应,并通过昇腾CI持续进行质量看护。
|
||||||
|
- **vX.Y.Z-dev**: 开发分支,随vLLM部分新版本发布而创建,比如`v0.7.3-dev`是vllm-ascend针对vLLM `v0.7.3`版本的开发分支。
|
||||||
|
|
||||||
|
下面是维护中的分支:
|
||||||
|
|
||||||
|
| 分支 | 状态 | 备注 |
|
||||||
|
|------------|------------|---------------------|
|
||||||
|
| main | Maintained | 基于vLLM main分支CI看护 |
|
||||||
|
| v0.7.1-dev | Unmaintained | 只允许文档修复 |
|
||||||
|
| v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护, 只允许Bug修复,不会再发布新版本 |
|
||||||
|
| v0.9.1-dev | Maintained | 基于vLLM v0.9.1版本CI看护 |
|
||||||
|
|rfc/feature-name| Maintained | 为协作创建的[特性分支](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) |
|
||||||
|
|
||||||
|
请参阅[版本策略](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html)了解更多详细信息。
|
||||||
|
|
||||||
|
## 社区例会
|
||||||
|
|
||||||
|
- vLLM Ascend 每周社区例会: https://tinyurl.com/vllm-ascend-meeting
|
||||||
|
- 每周三下午,15:00 - 16:00 (UTC+8, [查看您的时区](https://dateful.com/convert/gmt8?t=15))
|
||||||
|
|
||||||
|
## 许可证
|
||||||
|
Apache 许可证 2.0,如 [LICENSE](./LICENSE) 文件中所示。
|
||||||
175
benchmarks/README.md
Normal file
175
benchmarks/README.md
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
# Introduction
|
||||||
|
This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance.
|
||||||
|
|
||||||
|
# Overview
|
||||||
|
**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas800I A2 (see [quick_start](../docs/source/quick_start.md) to learn more supported devices list), with different models (coming soon).
|
||||||
|
- Latency tests
|
||||||
|
- Input length: 32 tokens.
|
||||||
|
- Output length: 128 tokens.
|
||||||
|
- Batch size: fixed (8).
|
||||||
|
- Models: Qwen2.5-7B-Instruct, Qwen3-8B.
|
||||||
|
- Evaluation metrics: end-to-end latency (mean, median, p99).
|
||||||
|
|
||||||
|
- Throughput tests
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||||
|
- Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
|
||||||
|
- Evaluation metrics: throughput.
|
||||||
|
- Serving tests
|
||||||
|
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||||
|
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||||
|
- Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
|
||||||
|
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
|
||||||
|
|
||||||
|
**Benchmarking Duration**: about 800 seconds for a single model.
|
||||||
|
|
||||||
|
# Quick Use
|
||||||
|
## Prerequisites
|
||||||
|
Before running the benchmarks, ensure the following:
|
||||||
|
|
||||||
|
- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices.
|
||||||
|
|
||||||
|
- Install necessary dependencies for benchmarks:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install -r benchmarks/requirements-bench.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
- For performance benchmark, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) as `dummy`, It will construct random weights based on the passed model without downloading the weights from internet, which can greatly reduce the benchmark time.
|
||||||
|
- If you want to run a customized benchmark, feel free to add your own models and parameters in the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests). Let's take `Qwen2.5-VL-7B-Instruct` as an example:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen2_5vl_7B_tp1",
|
||||||
|
"qps_list": [
|
||||||
|
1,
|
||||||
|
4,
|
||||||
|
16,
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"max_model_len": 16384
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"backend": "openai-chat",
|
||||||
|
"dataset_name": "hf",
|
||||||
|
"hf_split": "train",
|
||||||
|
"endpoint": "/v1/chat/completions",
|
||||||
|
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
This JSON will be structured and parsed into server parameters and client parameters by the benchmark script. This configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters. For more parameter details, see vllm benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks).
|
||||||
|
|
||||||
|
- **Test Overview**
|
||||||
|
- Test Name: serving_qwen2_5vl_7B_tp1
|
||||||
|
|
||||||
|
- Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing).
|
||||||
|
|
||||||
|
- Server Parameters
|
||||||
|
- Model: Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
|
||||||
|
- Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node)
|
||||||
|
|
||||||
|
- Swap Space: 16 GB (used to handle memory overflow by swapping to disk)
|
||||||
|
|
||||||
|
- disable_log_stats: disables logging of performance statistics.
|
||||||
|
|
||||||
|
- disable_log_requests: disables logging of individual requests.
|
||||||
|
|
||||||
|
- Trust Remote Code: enabled (allows execution of model-specific custom code)
|
||||||
|
|
||||||
|
- Max Model Length: 16,384 tokens (maximum context length supported by the model)
|
||||||
|
|
||||||
|
- Client Parameters
|
||||||
|
|
||||||
|
- Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server)
|
||||||
|
|
||||||
|
- Backend: openai-chat (suggests the client uses the OpenAI-compatible chat API format)
|
||||||
|
|
||||||
|
- Dataset Source: Hugging Face (hf)
|
||||||
|
|
||||||
|
- Dataset Split: train
|
||||||
|
|
||||||
|
- Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent)
|
||||||
|
|
||||||
|
- Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face)
|
||||||
|
|
||||||
|
- Number of Prompts: 200 (the total number of prompts used during the test)
|
||||||
|
|
||||||
|
## Run benchmarks
|
||||||
|
|
||||||
|
### Use benchmark script
|
||||||
|
The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run the following command in the vllm-ascend root directory:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
bash benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
.
|
||||||
|
|-- serving_qwen2_5_7B_tp1_qps_1.json
|
||||||
|
|-- serving_qwen2_5_7B_tp1_qps_16.json
|
||||||
|
|-- serving_qwen2_5_7B_tp1_qps_4.json
|
||||||
|
|-- serving_qwen2_5_7B_tp1_qps_inf.json
|
||||||
|
|-- latency_qwen2_5_7B_tp1.json
|
||||||
|
|-- throughput_qwen2_5_7B_tp1.json
|
||||||
|
```
|
||||||
|
|
||||||
|
These files contain detailed benchmarking results for further analysis.
|
||||||
|
|
||||||
|
### Use benchmark cli
|
||||||
|
|
||||||
|
For more flexible and customized use, benchmark cli is also provided to run online/offline benchmarks
|
||||||
|
Similarly, let’s take `Qwen2.5-VL-7B-Instruct` benchmark as an example:
|
||||||
|
#### Online serving
|
||||||
|
1. Launch the server:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
vllm serve Qwen2.5-VL-7B-Instruct --max-model-len 16789
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Running performance tests using cli
|
||||||
|
|
||||||
|
```shell
|
||||||
|
vllm bench serve --model Qwen2.5-VL-7B-Instruct\
|
||||||
|
--endpoint-type "openai-chat" --dataset-name hf \
|
||||||
|
--hf-split train --endpoint "/v1/chat/completions" \
|
||||||
|
--dataset-path "lmarena-ai/vision-arena-bench-v0.1" \
|
||||||
|
--num-prompts 200 \
|
||||||
|
--request-rate 16
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Offline
|
||||||
|
- **Throughput**
|
||||||
|
|
||||||
|
```shell
|
||||||
|
vllm bench throughput --output-json results/throughput_qwen2_5_7B_tp1.json \
|
||||||
|
--model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 --load-format dummy \
|
||||||
|
--dataset-path /github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--num-prompts 200 --backend vllm
|
||||||
|
```
|
||||||
|
|
||||||
|
- **Latency**
|
||||||
|
|
||||||
|
```shell
|
||||||
|
vllm bench latency --output-json results/latency_qwen2_5_7B_tp1.json \
|
||||||
|
--model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 \
|
||||||
|
--load-format dummy --num-iters-warmup 5 --num-iters 15
|
||||||
|
```
|
||||||
158
benchmarks/ops/ben_vocabparallelembedding.py
Normal file
158
benchmarks/ops/ben_vocabparallelembedding.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import torch
|
||||||
|
import torch_npu # noqa: F401
|
||||||
|
import vllm # noqa: F401
|
||||||
|
|
||||||
|
import vllm_ascend.platform # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50):
|
||||||
|
"""
|
||||||
|
Benchmark function for NPU operations
|
||||||
|
|
||||||
|
Args:
|
||||||
|
fn: Function to benchmark
|
||||||
|
num_iterations: Number of timing iterations
|
||||||
|
num_warmup_iterations: Number of warmup iterations
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
float: Minimum elapsed time in seconds
|
||||||
|
"""
|
||||||
|
start = torch.npu.Event(enable_timing=True)
|
||||||
|
end = torch.npu.Event(enable_timing=True)
|
||||||
|
times = np.zeros(num_iterations + num_warmup_iterations)
|
||||||
|
|
||||||
|
# Run iterations
|
||||||
|
for i in range(num_warmup_iterations + num_iterations):
|
||||||
|
with torch.no_grad():
|
||||||
|
start.record()
|
||||||
|
fn() # Execute the function
|
||||||
|
end.record()
|
||||||
|
torch.npu.synchronize()
|
||||||
|
times[i] = start.elapsed_time(end)
|
||||||
|
|
||||||
|
# Remove warmup iterations and convert to seconds
|
||||||
|
times = times[num_warmup_iterations:]
|
||||||
|
elapsed_time = np.amin(times) / 1000
|
||||||
|
return elapsed_time
|
||||||
|
|
||||||
|
|
||||||
|
def get_masked_input_and_mask_ref(
|
||||||
|
input_: torch.Tensor,
|
||||||
|
org_vocab_start_index: int,
|
||||||
|
org_vocab_end_index: int,
|
||||||
|
num_org_vocab_padding: int,
|
||||||
|
added_vocab_start_index: int,
|
||||||
|
added_vocab_end_index: int,
|
||||||
|
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
"""Reference implementation for verification"""
|
||||||
|
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
|
||||||
|
added_vocab_mask = (input_ >= added_vocab_start_index) & (
|
||||||
|
input_ < added_vocab_end_index
|
||||||
|
)
|
||||||
|
added_offset = (
|
||||||
|
added_vocab_start_index
|
||||||
|
- (org_vocab_end_index - org_vocab_start_index)
|
||||||
|
- num_org_vocab_padding
|
||||||
|
)
|
||||||
|
valid_offset = (org_vocab_start_index * org_vocab_mask) + (
|
||||||
|
added_offset * added_vocab_mask
|
||||||
|
)
|
||||||
|
vocab_mask = org_vocab_mask | added_vocab_mask
|
||||||
|
masked_input = vocab_mask * (input_ - valid_offset)
|
||||||
|
return masked_input, ~vocab_mask
|
||||||
|
|
||||||
|
|
||||||
|
DTYPES = [torch.int32]
|
||||||
|
SHAPES = [(3, 4, 5)]
|
||||||
|
DEVICES = [f"npu:{0}"]
|
||||||
|
SEEDS = [0]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("shape", SHAPES)
|
||||||
|
@pytest.mark.parametrize("dtype", DTYPES)
|
||||||
|
@pytest.mark.parametrize("device", DEVICES)
|
||||||
|
@pytest.mark.parametrize("seed", SEEDS)
|
||||||
|
@torch.inference_mode()
|
||||||
|
def test_get_masked_input_and_mask(
|
||||||
|
shape: Tuple[int, ...],
|
||||||
|
dtype: torch.dtype,
|
||||||
|
device: str,
|
||||||
|
seed: int,
|
||||||
|
) -> None:
|
||||||
|
# Set random seed and device
|
||||||
|
torch.manual_seed(seed)
|
||||||
|
torch.set_default_device(device)
|
||||||
|
|
||||||
|
# Generate random input tensor
|
||||||
|
input_tensor = torch.randint(0, 1000, shape, dtype=dtype)
|
||||||
|
|
||||||
|
# Test parameters
|
||||||
|
test_case = {
|
||||||
|
"org_start": 100,
|
||||||
|
"org_end": 200,
|
||||||
|
"padding": 0,
|
||||||
|
"added_start": 300,
|
||||||
|
"added_end": 400,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Define reference function
|
||||||
|
def ref_fn():
|
||||||
|
return get_masked_input_and_mask_ref(
|
||||||
|
input_tensor,
|
||||||
|
test_case["org_start"],
|
||||||
|
test_case["org_end"],
|
||||||
|
test_case["padding"],
|
||||||
|
test_case["added_start"],
|
||||||
|
test_case["added_end"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define custom function
|
||||||
|
def custom_fn():
|
||||||
|
return torch.ops._C.get_masked_input_and_mask(
|
||||||
|
input_tensor,
|
||||||
|
test_case["org_start"],
|
||||||
|
test_case["org_end"],
|
||||||
|
test_case["padding"],
|
||||||
|
test_case["added_start"],
|
||||||
|
test_case["added_end"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get results for correctness testing
|
||||||
|
ref_masked_input, ref_mask = ref_fn()
|
||||||
|
custom_masked_input, custom_mask = custom_fn()
|
||||||
|
|
||||||
|
# Benchmark both implementations
|
||||||
|
ref_time = benchmark_npu(ref_fn)
|
||||||
|
custom_time = benchmark_npu(custom_fn)
|
||||||
|
|
||||||
|
# Print performance results
|
||||||
|
print("\nPerformance Results:")
|
||||||
|
print(f"Reference implementation: {ref_time * 1000:.3f} ms")
|
||||||
|
print(f"Custom implementation: {custom_time * 1000:.3f} ms")
|
||||||
|
print(f"Speedup: {ref_time / custom_time:.2f}x")
|
||||||
|
|
||||||
|
# Compare results for correctness
|
||||||
|
ref_masked_input = ref_masked_input.to(dtype)
|
||||||
|
print("\nResults comparison:")
|
||||||
|
print("custom_masked_input:", custom_masked_input)
|
||||||
|
print("ref_masked_input:", ref_masked_input)
|
||||||
|
print("custom_mask:", custom_mask)
|
||||||
|
print("ref_mask:", ref_mask)
|
||||||
|
torch.testing.assert_close(
|
||||||
|
custom_masked_input,
|
||||||
|
ref_masked_input,
|
||||||
|
rtol=1e-5,
|
||||||
|
atol=1e-5,
|
||||||
|
msg=f"Masked input mismatch for case: {test_case}",
|
||||||
|
)
|
||||||
|
torch.testing.assert_close(
|
||||||
|
custom_mask,
|
||||||
|
ref_mask,
|
||||||
|
rtol=1e-5,
|
||||||
|
atol=1e-5,
|
||||||
|
msg=f"Mask mismatch for case: {test_case}",
|
||||||
|
)
|
||||||
4
benchmarks/requirements-bench.txt
Normal file
4
benchmarks/requirements-bench.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
pandas
|
||||||
|
datasets
|
||||||
|
modelscope
|
||||||
|
tabulate
|
||||||
188
benchmarks/scripts/convert_json_to_markdown.py
Normal file
188
benchmarks/scripts/convert_json_to_markdown.py
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
CUR_PATH = Path(__file__).parent.resolve()
|
||||||
|
# latency results and the keys that will be printed into markdown
|
||||||
|
latency_results = []
|
||||||
|
latency_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"avg_latency": "Mean latency (ms)",
|
||||||
|
"P50": "Median latency (ms)",
|
||||||
|
"P99": "P99 latency (ms)",
|
||||||
|
}
|
||||||
|
|
||||||
|
# throughput tests and the keys that will be printed into markdown
|
||||||
|
throughput_results = []
|
||||||
|
throughput_results_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"num_requests": "Num of reqs",
|
||||||
|
"total_num_tokens": "Total num of tokens",
|
||||||
|
"elapsed_time": "Elapsed time (s)",
|
||||||
|
"requests_per_second": "Tput (req/s)",
|
||||||
|
"tokens_per_second": "Tput (tok/s)",
|
||||||
|
}
|
||||||
|
|
||||||
|
# serving results and the keys that will be printed into markdown
|
||||||
|
serving_results = []
|
||||||
|
serving_column_mapping = {
|
||||||
|
"test_name": "Test name",
|
||||||
|
"request_rate": "Request rate (req/s)",
|
||||||
|
"request_throughput": "Tput (req/s)",
|
||||||
|
"output_throughput": "Output Tput (tok/s)",
|
||||||
|
"median_ttft_ms": "TTFT (ms)",
|
||||||
|
"median_tpot_ms": "TPOT (ms)",
|
||||||
|
"median_itl_ms": "ITL (ms)",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def read_markdown(file):
|
||||||
|
if os.path.exists(file):
|
||||||
|
with open(file) as f:
|
||||||
|
return f.read() + "\n"
|
||||||
|
else:
|
||||||
|
return f"{file} not found.\n"
|
||||||
|
|
||||||
|
|
||||||
|
def results_to_json(latency, throughput, serving):
|
||||||
|
return json.dumps(
|
||||||
|
{
|
||||||
|
"latency": latency.to_dict(),
|
||||||
|
"throughput": throughput.to_dict(),
|
||||||
|
"serving": serving.to_dict(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Process the results of the benchmark tests."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--results_folder",
|
||||||
|
type=str,
|
||||||
|
default="../results/",
|
||||||
|
help="The folder where the benchmark results are stored.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output_folder",
|
||||||
|
type=str,
|
||||||
|
default="../results/",
|
||||||
|
help="The folder where the benchmark results are stored.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--markdown_template",
|
||||||
|
type=str,
|
||||||
|
default="./perf_result_template.md",
|
||||||
|
help="The template file for the markdown report.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tag", default="main", help="Tag to be used for release message."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--commit_id", default="", help="Commit ID to be used for release message."
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
results_folder = (CUR_PATH / args.results_folder).resolve()
|
||||||
|
output_folder = (CUR_PATH / args.output_folder).resolve()
|
||||||
|
markdown_template = (CUR_PATH / args.markdown_template).resolve()
|
||||||
|
|
||||||
|
# collect results
|
||||||
|
for test_file in results_folder.glob("*.json"):
|
||||||
|
with open(test_file) as f:
|
||||||
|
raw_result = json.loads(f.read())
|
||||||
|
|
||||||
|
if "serving" in str(test_file):
|
||||||
|
# this result is generated via `benchmark_serving.py`
|
||||||
|
|
||||||
|
# update the test name of this result
|
||||||
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
|
# add the result to raw_result
|
||||||
|
serving_results.append(raw_result)
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif "latency" in f.name:
|
||||||
|
# this result is generated via `benchmark_latency.py`
|
||||||
|
|
||||||
|
# update the test name of this result
|
||||||
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
|
# get different percentiles
|
||||||
|
for perc in [10, 25, 50, 75, 90, 99]:
|
||||||
|
# Multiply 1000 to convert the time unit from s to ms
|
||||||
|
raw_result.update(
|
||||||
|
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
|
||||||
|
)
|
||||||
|
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
|
||||||
|
|
||||||
|
# add the result to raw_result
|
||||||
|
latency_results.append(raw_result)
|
||||||
|
continue
|
||||||
|
|
||||||
|
elif "throughput" in f.name:
|
||||||
|
# this result is generated via `benchmark_throughput.py`
|
||||||
|
|
||||||
|
# update the test name of this result
|
||||||
|
raw_result.update({"test_name": test_file.stem})
|
||||||
|
|
||||||
|
# add the result to raw_result
|
||||||
|
throughput_results.append(raw_result)
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f"Skipping {test_file}")
|
||||||
|
serving_results.sort(key=lambda x: (len(x["test_name"]), x["test_name"]))
|
||||||
|
|
||||||
|
latency_results = pd.DataFrame.from_dict(latency_results)
|
||||||
|
serving_results = pd.DataFrame.from_dict(serving_results)
|
||||||
|
throughput_results = pd.DataFrame.from_dict(throughput_results)
|
||||||
|
|
||||||
|
raw_results_json = results_to_json(
|
||||||
|
latency_results, throughput_results, serving_results
|
||||||
|
)
|
||||||
|
|
||||||
|
# remapping the key, for visualization purpose
|
||||||
|
if not latency_results.empty:
|
||||||
|
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
|
||||||
|
columns=latency_column_mapping
|
||||||
|
)
|
||||||
|
if not serving_results.empty:
|
||||||
|
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
|
||||||
|
columns=serving_column_mapping
|
||||||
|
)
|
||||||
|
if not throughput_results.empty:
|
||||||
|
throughput_results = throughput_results[
|
||||||
|
list(throughput_results_column_mapping.keys())
|
||||||
|
].rename(columns=throughput_results_column_mapping)
|
||||||
|
|
||||||
|
processed_results_json = results_to_json(
|
||||||
|
latency_results, throughput_results, serving_results
|
||||||
|
)
|
||||||
|
|
||||||
|
# get markdown tables
|
||||||
|
latency_md_table = tabulate(
|
||||||
|
latency_results, headers="keys", tablefmt="pipe", showindex=False
|
||||||
|
)
|
||||||
|
serving_md_table = tabulate(
|
||||||
|
serving_results, headers="keys", tablefmt="pipe", showindex=False
|
||||||
|
)
|
||||||
|
throughput_md_table = tabulate(
|
||||||
|
throughput_results, headers="keys", tablefmt="pipe", showindex=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# document the result
|
||||||
|
print(output_folder)
|
||||||
|
with open(output_folder / "benchmark_results.md", "w") as f:
|
||||||
|
results = read_markdown(markdown_template)
|
||||||
|
results = results.format(
|
||||||
|
latency_tests_markdown_table=latency_md_table,
|
||||||
|
throughput_tests_markdown_table=throughput_md_table,
|
||||||
|
serving_tests_markdown_table=serving_md_table,
|
||||||
|
benchmarking_results_in_json_string=processed_results_json,
|
||||||
|
)
|
||||||
|
f.write(results)
|
||||||
31
benchmarks/scripts/perf_result_template.md
Normal file
31
benchmarks/scripts/perf_result_template.md
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
## Online serving tests
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
|
||||||
|
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
|
||||||
|
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
- Evaluation metrics: throughput, TTFT (median time to the first token), ITL (median inter-token latency), TPOT (median time per output token).
|
||||||
|
|
||||||
|
{serving_tests_markdown_table}
|
||||||
|
|
||||||
|
## Offline tests
|
||||||
|
### Latency tests
|
||||||
|
|
||||||
|
- Input length: 32 tokens.
|
||||||
|
- Output length: 128 tokens.
|
||||||
|
- Batch size: fixed (8).
|
||||||
|
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
- Evaluation metrics: end-to-end latency.
|
||||||
|
|
||||||
|
{latency_tests_markdown_table}
|
||||||
|
|
||||||
|
### Throughput tests
|
||||||
|
|
||||||
|
- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed).
|
||||||
|
- Output length: the corresponding output length of these 200 prompts.
|
||||||
|
- Batch size: dynamically determined by vllm to achieve maximum throughput.
|
||||||
|
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
|
||||||
|
- Evaluation metrics: throughput.
|
||||||
|
|
||||||
|
{throughput_tests_markdown_table}
|
||||||
321
benchmarks/scripts/run-performance-benchmarks.sh
Normal file
321
benchmarks/scripts/run-performance-benchmarks.sh
Normal file
@@ -0,0 +1,321 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
check_npus() {
|
||||||
|
# shellcheck disable=SC2155
|
||||||
|
declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')
|
||||||
|
|
||||||
|
if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
|
||||||
|
echo "Need at least 1 NPU to run benchmarking."
|
||||||
|
exit 1
|
||||||
|
else
|
||||||
|
echo "found NPU conut: $npu_count"
|
||||||
|
fi
|
||||||
|
|
||||||
|
npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')
|
||||||
|
|
||||||
|
echo "NPU type is: $npu_type"
|
||||||
|
}
|
||||||
|
|
||||||
|
ensure_sharegpt_downloaded() {
|
||||||
|
local FILE="/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
|
||||||
|
local DIR
|
||||||
|
DIR=$(dirname "$FILE")
|
||||||
|
|
||||||
|
if [ ! -f "$FILE" ]; then
|
||||||
|
echo "$FILE not found, downloading from hf-mirror ..."
|
||||||
|
mkdir -p "$DIR"
|
||||||
|
wget -O "$FILE" https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
if [ $? -ne 0 ]; then
|
||||||
|
echo "Download failed!" >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
echo "Download completed and saved to $FILE"
|
||||||
|
else
|
||||||
|
echo "$FILE already exists."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
json2args() {
|
||||||
|
# transforms the JSON string to command line args, and '_' is replaced to '-'
|
||||||
|
# example:
|
||||||
|
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
|
||||||
|
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
|
||||||
|
local json_string=$1
|
||||||
|
local args
|
||||||
|
args=$(
|
||||||
|
echo "$json_string" | jq -r '
|
||||||
|
to_entries |
|
||||||
|
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
|
||||||
|
join(" ")
|
||||||
|
'
|
||||||
|
)
|
||||||
|
echo "$args"
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_server() {
|
||||||
|
local waited=0
|
||||||
|
local timeout_sec=1200
|
||||||
|
|
||||||
|
while (( waited < timeout_sec )); do
|
||||||
|
if curl -s -X GET localhost:8000/health > /dev/null; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
echo "Waiting for vllm server to start..."
|
||||||
|
sleep 1
|
||||||
|
((waited++))
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Timeout waiting for server"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
get_cur_npu_id() {
|
||||||
|
npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}'
|
||||||
|
}
|
||||||
|
|
||||||
|
kill_npu_processes() {
|
||||||
|
ps -aux
|
||||||
|
lsof -t -i:8000 | xargs -r kill -9
|
||||||
|
pgrep python3 | xargs -r kill -9
|
||||||
|
|
||||||
|
sleep 4
|
||||||
|
rm -rf ~/.config/vllm
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
update_json_field() {
|
||||||
|
local json_file="$1"
|
||||||
|
local field_name="$2"
|
||||||
|
local field_value="$3"
|
||||||
|
|
||||||
|
jq --arg value "$field_value" \
|
||||||
|
--arg key "$field_name" \
|
||||||
|
'.[$key] = $value' "$json_file" > "${json_file}.tmp" && \
|
||||||
|
mv "${json_file}.tmp" "$json_file"
|
||||||
|
}
|
||||||
|
|
||||||
|
run_latency_tests() {
|
||||||
|
# run latency tests using `benchmark_latency.py`
|
||||||
|
# $1: a json file specifying latency test cases
|
||||||
|
|
||||||
|
local latency_test_file
|
||||||
|
latency_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over latency tests
|
||||||
|
jq -c '.[]' "$latency_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the NPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
if [[ ! "$test_name" =~ ^latency_ ]]; then
|
||||||
|
echo "In latency-test.json, test_name must start with \"latency_\"."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# get arguments
|
||||||
|
latency_params=$(echo "$params" | jq -r '.parameters')
|
||||||
|
latency_args=$(json2args "$latency_params")
|
||||||
|
|
||||||
|
latency_command="vllm bench latency \
|
||||||
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
|
$latency_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Latency command: $latency_command"
|
||||||
|
|
||||||
|
# run the benchmark
|
||||||
|
eval "$latency_command"
|
||||||
|
# echo model_name to result file
|
||||||
|
model_name=$(echo "$latency_params" | jq -r '.model')
|
||||||
|
update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
|
||||||
|
kill_npu_processes
|
||||||
|
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
run_throughput_tests() {
|
||||||
|
# run throughput tests using `benchmark_throughput.py`
|
||||||
|
# $1: a json file specifying throughput test cases
|
||||||
|
|
||||||
|
local throughput_test_file
|
||||||
|
throughput_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over throughput tests
|
||||||
|
jq -c '.[]' "$throughput_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the NPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
if [[ ! "$test_name" =~ ^throughput_ ]]; then
|
||||||
|
echo "In throughput-test.json, test_name must start with \"throughput_\"."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# get arguments
|
||||||
|
throughput_params=$(echo "$params" | jq -r '.parameters')
|
||||||
|
throughput_args=$(json2args "$throughput_params")
|
||||||
|
|
||||||
|
throughput_command="vllm bench throughput \
|
||||||
|
--output-json $RESULTS_FOLDER/${test_name}.json \
|
||||||
|
$throughput_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Throughput command: $throughput_command"
|
||||||
|
|
||||||
|
# run the benchmark
|
||||||
|
eval "$throughput_command"
|
||||||
|
# echo model_name to result file
|
||||||
|
model_name=$(echo "$throughput_params" | jq -r '.model')
|
||||||
|
update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
|
||||||
|
kill_npu_processes
|
||||||
|
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
run_serving_tests() {
|
||||||
|
# run serving tests using `benchmark_serving.py`
|
||||||
|
# $1: a json file specifying serving test cases
|
||||||
|
|
||||||
|
local serving_test_file
|
||||||
|
serving_test_file=$1
|
||||||
|
|
||||||
|
# Iterate over serving tests
|
||||||
|
jq -c '.[]' "$serving_test_file" | while read -r params; do
|
||||||
|
# get the test name, and append the NPU type back to it.
|
||||||
|
test_name=$(echo "$params" | jq -r '.test_name')
|
||||||
|
if [[ ! "$test_name" =~ ^serving_ ]]; then
|
||||||
|
echo "In serving-test.json, test_name must start with \"serving_\"."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# if TEST_SELECTOR is set, only run the test cases that match the selector
|
||||||
|
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
|
||||||
|
echo "Skip test case $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
# get client and server arguments
|
||||||
|
server_params=$(echo "$params" | jq -r '.server_parameters')
|
||||||
|
client_params=$(echo "$params" | jq -r '.client_parameters')
|
||||||
|
server_args=$(json2args "$server_params")
|
||||||
|
client_args=$(json2args "$client_params")
|
||||||
|
qps_list=$(echo "$params" | jq -r '.qps_list')
|
||||||
|
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
|
||||||
|
echo "Running over qps list $qps_list"
|
||||||
|
|
||||||
|
# check if server model and client model is aligned
|
||||||
|
server_model=$(echo "$server_params" | jq -r '.model')
|
||||||
|
client_model=$(echo "$client_params" | jq -r '.model')
|
||||||
|
if [[ $server_model != "$client_model" ]]; then
|
||||||
|
echo "Server model and client model must be the same. Skip testcase $test_name."
|
||||||
|
continue
|
||||||
|
fi
|
||||||
|
|
||||||
|
server_command="python3 \
|
||||||
|
-m vllm.entrypoints.openai.api_server \
|
||||||
|
$server_args"
|
||||||
|
|
||||||
|
# run the server
|
||||||
|
echo "Running test case $test_name"
|
||||||
|
echo "Server command: $server_command"
|
||||||
|
bash -c "$server_command" &
|
||||||
|
server_pid=$!
|
||||||
|
|
||||||
|
# wait until the server is alive
|
||||||
|
if wait_for_server; then
|
||||||
|
echo ""
|
||||||
|
echo "vllm server is up and running."
|
||||||
|
else
|
||||||
|
echo ""
|
||||||
|
echo "vllm failed to start within the timeout period."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# iterate over different QPS
|
||||||
|
for qps in $qps_list; do
|
||||||
|
# remove the surrounding single quote from qps
|
||||||
|
if [[ "$qps" == *"inf"* ]]; then
|
||||||
|
echo "qps was $qps"
|
||||||
|
qps="inf"
|
||||||
|
echo "now qps is $qps"
|
||||||
|
fi
|
||||||
|
|
||||||
|
new_test_name=$test_name"_qps_"$qps
|
||||||
|
|
||||||
|
client_command="vllm bench serve \
|
||||||
|
--save-result \
|
||||||
|
--result-dir $RESULTS_FOLDER \
|
||||||
|
--result-filename ${new_test_name}.json \
|
||||||
|
--request-rate $qps \
|
||||||
|
$client_args"
|
||||||
|
|
||||||
|
echo "Running test case $test_name with qps $qps"
|
||||||
|
echo "Client command: $client_command"
|
||||||
|
|
||||||
|
bash -c "$client_command"
|
||||||
|
done
|
||||||
|
|
||||||
|
# clean up
|
||||||
|
kill -9 $server_pid
|
||||||
|
kill_npu_processes
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
rm -rf ./vllm_benchmarks
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup_on_error() {
|
||||||
|
echo "An error occurred. Cleaning up results folder..."
|
||||||
|
rm -rf $RESULTS_FOLDER
|
||||||
|
}
|
||||||
|
|
||||||
|
main() {
|
||||||
|
START_TIME=$(date +%s)
|
||||||
|
check_npus
|
||||||
|
|
||||||
|
# dependencies
|
||||||
|
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
|
||||||
|
(which jq) || (apt-get update && apt-get -y install jq)
|
||||||
|
(which lsof) || (apt-get update && apt-get install -y lsof)
|
||||||
|
|
||||||
|
# get the current IP address, required by benchmark_serving.py
|
||||||
|
# shellcheck disable=SC2155
|
||||||
|
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
|
||||||
|
# turn of the reporting of the status of each request, to clean up the terminal output
|
||||||
|
export VLLM_LOG_LEVEL="WARNING"
|
||||||
|
|
||||||
|
# set env
|
||||||
|
export VLLM_USE_MODELSCOPE=True
|
||||||
|
|
||||||
|
# prepare for benchmarking
|
||||||
|
cd benchmarks || exit 1
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
QUICK_BENCHMARK_ROOT=./
|
||||||
|
|
||||||
|
declare -g RESULTS_FOLDER=results
|
||||||
|
mkdir -p $RESULTS_FOLDER
|
||||||
|
|
||||||
|
trap cleanup_on_error ERR
|
||||||
|
ensure_sharegpt_downloaded
|
||||||
|
# benchmarks
|
||||||
|
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
|
||||||
|
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
|
||||||
|
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
|
||||||
|
|
||||||
|
END_TIME=$(date +%s)
|
||||||
|
ELAPSED_TIME=$((END_TIME - START_TIME))
|
||||||
|
echo "Total execution time: $ELAPSED_TIME seconds"
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
23
benchmarks/tests/latency-tests.json
Normal file
23
benchmarks/tests/latency-tests.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_qwen3_8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"max_model_len": 16384,
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "latency_qwen2_5_7B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
77
benchmarks/tests/serving-tests.json
Normal file
77
benchmarks/tests/serving-tests.json
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen2_5vl_7B_tp1",
|
||||||
|
"qps_list": [
|
||||||
|
1,
|
||||||
|
4,
|
||||||
|
16,
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"max_model_len": 16384
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"endpoint_type": "openai-chat",
|
||||||
|
"dataset_name": "hf",
|
||||||
|
"hf_split": "train",
|
||||||
|
"endpoint": "/v1/chat/completions",
|
||||||
|
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen3_8B_tp1",
|
||||||
|
"qps_list": [
|
||||||
|
1,
|
||||||
|
4,
|
||||||
|
16,
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"endpoint_type": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_qwen2_5_7B_tp1",
|
||||||
|
"qps_list": [
|
||||||
|
1,
|
||||||
|
4,
|
||||||
|
16,
|
||||||
|
"inf"
|
||||||
|
],
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"swap_space": 16,
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"disable_log_requests": "",
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"endpoint_type": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
38
benchmarks/tests/throughput-tests.json
Normal file
38
benchmarks/tests/throughput-tests.json
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "throughput_qwen3_8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen3-8B",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_qwen2_5vl_7B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"backend": "vllm-chat",
|
||||||
|
"dataset_name": "hf",
|
||||||
|
"hf_split": "train",
|
||||||
|
"max_model_len": 16384,
|
||||||
|
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "throughput_qwen2_5_7B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200,
|
||||||
|
"backend": "vllm"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
133
cmake/utils.cmake
Normal file
133
cmake/utils.cmake
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
#
|
||||||
|
# Attempt to find the python package that uses the same python executable as
|
||||||
|
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
|
||||||
|
#
|
||||||
|
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
|
||||||
|
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
|
||||||
|
set(Python_EXECUTABLE ${EXECUTABLE})
|
||||||
|
find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
|
||||||
|
if (NOT Python_FOUND)
|
||||||
|
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
|
||||||
|
endif()
|
||||||
|
set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
|
||||||
|
set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
|
||||||
|
if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
|
||||||
|
message(FATAL_ERROR
|
||||||
|
"Python version (${_VER}) is not one of the supported versions: "
|
||||||
|
"${_SUPPORTED_VERSIONS_LIST}.")
|
||||||
|
endif()
|
||||||
|
message(STATUS "Found python matching: ${EXECUTABLE}.")
|
||||||
|
endmacro()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Run `EXPR` in python. The standard output of python is stored in `OUT` and
|
||||||
|
# has trailing whitespace stripped. If an error is encountered when running
|
||||||
|
# python, a fatal message `ERR_MSG` is issued.
|
||||||
|
#
|
||||||
|
function (run_python OUT EXPR ERR_MSG)
|
||||||
|
execute_process(
|
||||||
|
COMMAND
|
||||||
|
"${PYTHON_EXECUTABLE}" "-c" "${EXPR}"
|
||||||
|
OUTPUT_VARIABLE PYTHON_OUT
|
||||||
|
RESULT_VARIABLE PYTHON_ERROR_CODE
|
||||||
|
ERROR_VARIABLE PYTHON_STDERR
|
||||||
|
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
if(NOT PYTHON_ERROR_CODE EQUAL 0)
|
||||||
|
message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
|
||||||
|
endif()
|
||||||
|
set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
|
||||||
|
endfunction()
|
||||||
|
|
||||||
|
# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
|
||||||
|
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
|
||||||
|
macro (append_cmake_prefix_path PKG EXPR)
|
||||||
|
run_python(_PREFIX_PATH
|
||||||
|
"import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
|
||||||
|
endmacro()
|
||||||
|
|
||||||
|
|
||||||
|
# This cmake function is adapted from vllm /Users/ganyi/workspace/vllm-ascend/cmake/utils.cmake
|
||||||
|
# Define a target named `GPU_MOD_NAME` for a single extension. The
|
||||||
|
# arguments are:
|
||||||
|
#
|
||||||
|
# DESTINATION <dest> - Module destination directory.
|
||||||
|
# LANGUAGE <lang> - The GPU language for this module, e.g CUDA, HIP,
|
||||||
|
# etc.
|
||||||
|
# SOURCES <sources> - List of source files relative to CMakeLists.txt
|
||||||
|
# directory.
|
||||||
|
#
|
||||||
|
# Optional arguments:
|
||||||
|
#
|
||||||
|
# ARCHITECTURES <arches> - A list of target GPU architectures in cmake
|
||||||
|
# format.
|
||||||
|
# Refer `CMAKE_CUDA_ARCHITECTURES` documentation
|
||||||
|
# and `CMAKE_HIP_ARCHITECTURES` for more info.
|
||||||
|
# ARCHITECTURES will use cmake's defaults if
|
||||||
|
# not provided.
|
||||||
|
# COMPILE_FLAGS <flags> - Extra compiler flags passed to NVCC/hip.
|
||||||
|
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
|
||||||
|
# LIBRARIES <libraries> - Extra link libraries.
|
||||||
|
# WITH_SOABI - Generate library with python SOABI suffix name.
|
||||||
|
# USE_SABI <version> - Use python stable api <version>
|
||||||
|
#
|
||||||
|
# Note: optimization level/debug info is set via cmake build type.
|
||||||
|
#
|
||||||
|
function (define_gpu_extension_target GPU_MOD_NAME)
|
||||||
|
cmake_parse_arguments(PARSE_ARGV 1
|
||||||
|
GPU
|
||||||
|
"WITH_SOABI"
|
||||||
|
"DESTINATION;LANGUAGE;USE_SABI"
|
||||||
|
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
|
||||||
|
|
||||||
|
# Add hipify preprocessing step when building with HIP/ROCm.
|
||||||
|
if (GPU_LANGUAGE STREQUAL "HIP")
|
||||||
|
hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GPU_WITH_SOABI)
|
||||||
|
set(GPU_WITH_SOABI WITH_SOABI)
|
||||||
|
else()
|
||||||
|
set(GPU_WITH_SOABI)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GPU_USE_SABI)
|
||||||
|
Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||||
|
else()
|
||||||
|
Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GPU_LANGUAGE STREQUAL "HIP")
|
||||||
|
# Make this target dependent on the hipify preprocessor step.
|
||||||
|
add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GPU_ARCHITECTURES)
|
||||||
|
set_target_properties(${GPU_MOD_NAME} PROPERTIES
|
||||||
|
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
|
||||||
|
|
||||||
|
target_compile_options(${GPU_MOD_NAME} PRIVATE
|
||||||
|
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
|
||||||
|
|
||||||
|
target_compile_definitions(${GPU_MOD_NAME} PRIVATE
|
||||||
|
"-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
|
||||||
|
|
||||||
|
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
|
||||||
|
${GPU_INCLUDE_DIRECTORIES})
|
||||||
|
|
||||||
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
|
||||||
|
|
||||||
|
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
|
||||||
|
# dependencies that are not necessary and may not be installed.
|
||||||
|
if (GPU_LANGUAGE STREQUAL "CUDA")
|
||||||
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
|
||||||
|
else()
|
||||||
|
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
|
||||||
|
endfunction()
|
||||||
28
codecov.yml
Normal file
28
codecov.yml
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# This file is a part of the vllm-ascend project.
|
||||||
|
#
|
||||||
|
|
||||||
|
coverage:
|
||||||
|
status:
|
||||||
|
# Patch coverage is mandatory and must be >= 80%
|
||||||
|
patch:
|
||||||
|
default:
|
||||||
|
target: 80%
|
||||||
|
# non-voting
|
||||||
|
project:
|
||||||
|
default:
|
||||||
|
# non-voting
|
||||||
|
informational: true
|
||||||
489
collect_env.py
Normal file
489
collect_env.py
Normal file
@@ -0,0 +1,489 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2023 The vLLM team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Adapted from https://github.com/vllm-project/vllm/blob/main/collect_env.py
|
||||||
|
#
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import locale
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
from vllm.envs import environment_variables
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
TORCH_AVAILABLE = True
|
||||||
|
except (ImportError, NameError, AttributeError, OSError):
|
||||||
|
TORCH_AVAILABLE = False
|
||||||
|
|
||||||
|
# System Environment Information
|
||||||
|
SystemEnv = namedtuple(
|
||||||
|
'SystemEnv',
|
||||||
|
[
|
||||||
|
'torch_version',
|
||||||
|
'is_debug_build',
|
||||||
|
'gcc_version',
|
||||||
|
'clang_version',
|
||||||
|
'cmake_version',
|
||||||
|
'os',
|
||||||
|
'libc_version',
|
||||||
|
'python_version',
|
||||||
|
'python_platform',
|
||||||
|
'pip_version', # 'pip' or 'pip3'
|
||||||
|
'pip_packages',
|
||||||
|
'conda_packages',
|
||||||
|
'cpu_info',
|
||||||
|
'vllm_version', # vllm specific field
|
||||||
|
'vllm_ascend_version', # vllm ascend specific field
|
||||||
|
'env_vars',
|
||||||
|
'npu_info', # ascend specific field
|
||||||
|
'cann_info', # ascend specific field
|
||||||
|
])
|
||||||
|
|
||||||
|
DEFAULT_CONDA_PATTERNS = {
|
||||||
|
"torch",
|
||||||
|
"numpy",
|
||||||
|
"soumith",
|
||||||
|
"mkl",
|
||||||
|
"magma",
|
||||||
|
"optree",
|
||||||
|
"transformers",
|
||||||
|
"zmq",
|
||||||
|
"pynvml",
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_PIP_PATTERNS = {
|
||||||
|
"torch",
|
||||||
|
"numpy",
|
||||||
|
"mypy",
|
||||||
|
"flake8",
|
||||||
|
"optree",
|
||||||
|
"onnx",
|
||||||
|
"transformers",
|
||||||
|
"zmq",
|
||||||
|
"pynvml",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run(command):
|
||||||
|
"""Return (return-code, stdout, stderr)."""
|
||||||
|
shell = True if type(command) is str else False
|
||||||
|
p = subprocess.Popen(command,
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
shell=shell)
|
||||||
|
raw_output, raw_err = p.communicate()
|
||||||
|
rc = p.returncode
|
||||||
|
if get_platform() == 'win32':
|
||||||
|
enc = 'oem'
|
||||||
|
else:
|
||||||
|
enc = locale.getpreferredencoding()
|
||||||
|
output = raw_output.decode(enc)
|
||||||
|
err = raw_err.decode(enc)
|
||||||
|
return rc, output.strip(), err.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_read_all(run_lambda, command):
|
||||||
|
"""Run command using run_lambda; reads and returns entire output if rc is 0."""
|
||||||
|
rc, out, _ = run_lambda(command)
|
||||||
|
if rc != 0:
|
||||||
|
return None
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_parse_first_match(run_lambda, command, regex):
|
||||||
|
"""Run command using run_lambda, returns the first regex match if it exists."""
|
||||||
|
rc, out, _ = run_lambda(command)
|
||||||
|
if rc != 0:
|
||||||
|
return None
|
||||||
|
match = re.search(regex, out)
|
||||||
|
if match is None:
|
||||||
|
return None
|
||||||
|
return match.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def run_and_return_first_line(run_lambda, command):
|
||||||
|
"""Run command using run_lambda and returns first line if output is not empty."""
|
||||||
|
rc, out, _ = run_lambda(command)
|
||||||
|
if rc != 0:
|
||||||
|
return None
|
||||||
|
return out.split('\n')[0]
|
||||||
|
|
||||||
|
|
||||||
|
def get_conda_packages(run_lambda, patterns=None):
|
||||||
|
if patterns is None:
|
||||||
|
patterns = DEFAULT_CONDA_PATTERNS
|
||||||
|
conda = os.environ.get('CONDA_EXE', 'conda')
|
||||||
|
out = run_and_read_all(run_lambda, "{} list".format(conda))
|
||||||
|
if out is None:
|
||||||
|
return out
|
||||||
|
|
||||||
|
return "\n".join(line for line in out.splitlines()
|
||||||
|
if not line.startswith("#") and any(name in line
|
||||||
|
for name in patterns))
|
||||||
|
|
||||||
|
|
||||||
|
def get_gcc_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_clang_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'clang --version',
|
||||||
|
r'clang version (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_cmake_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'cmake --version',
|
||||||
|
r'cmake (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_version(version, version_tuple):
|
||||||
|
version_str = version_tuple[-1]
|
||||||
|
if isinstance(version_str, str) and version_str.startswith('g'):
|
||||||
|
if '.' in version_str:
|
||||||
|
git_sha = version_str.split('.')[0][1:]
|
||||||
|
date = version_str.split('.')[-1][1:]
|
||||||
|
return f"{version} (git sha: {git_sha}, date: {date})"
|
||||||
|
else:
|
||||||
|
git_sha = version_str[1:] # type: ignore
|
||||||
|
return f"{version} (git sha: {git_sha})"
|
||||||
|
return version
|
||||||
|
|
||||||
|
|
||||||
|
def get_vllm_version():
|
||||||
|
from vllm import __version__, __version_tuple__
|
||||||
|
return _parse_version(__version__, __version_tuple__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_vllm_ascend_version():
|
||||||
|
from vllm_ascend._version import __version__, __version_tuple__
|
||||||
|
return _parse_version(__version__, __version_tuple__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpu_info(run_lambda):
|
||||||
|
rc, out, err = 0, '', ''
|
||||||
|
if get_platform() == 'linux':
|
||||||
|
rc, out, err = run_lambda('lscpu')
|
||||||
|
elif get_platform() == 'win32':
|
||||||
|
rc, out, err = run_lambda(
|
||||||
|
'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
|
||||||
|
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
|
||||||
|
)
|
||||||
|
elif get_platform() == 'darwin':
|
||||||
|
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
|
||||||
|
cpu_info = 'None'
|
||||||
|
if rc == 0:
|
||||||
|
cpu_info = out
|
||||||
|
else:
|
||||||
|
cpu_info = err
|
||||||
|
return cpu_info
|
||||||
|
|
||||||
|
|
||||||
|
def get_platform():
|
||||||
|
if sys.platform.startswith('linux'):
|
||||||
|
return 'linux'
|
||||||
|
elif sys.platform.startswith('win32'):
|
||||||
|
return 'win32'
|
||||||
|
elif sys.platform.startswith('cygwin'):
|
||||||
|
return 'cygwin'
|
||||||
|
elif sys.platform.startswith('darwin'):
|
||||||
|
return 'darwin'
|
||||||
|
else:
|
||||||
|
return sys.platform
|
||||||
|
|
||||||
|
|
||||||
|
def get_mac_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
|
||||||
|
r'(.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_windows_version(run_lambda):
|
||||||
|
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||||
|
wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
|
||||||
|
findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
|
||||||
|
return run_and_read_all(
|
||||||
|
run_lambda,
|
||||||
|
'{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
|
||||||
|
|
||||||
|
|
||||||
|
def get_lsb_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'lsb_release -a',
|
||||||
|
r'Description:\t(.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def check_release_file(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
|
||||||
|
r'PRETTY_NAME="(.*)"')
|
||||||
|
|
||||||
|
|
||||||
|
def get_os(run_lambda):
|
||||||
|
from platform import machine
|
||||||
|
platform = get_platform()
|
||||||
|
|
||||||
|
if platform == 'win32' or platform == 'cygwin':
|
||||||
|
return get_windows_version(run_lambda)
|
||||||
|
|
||||||
|
if platform == 'darwin':
|
||||||
|
version = get_mac_version(run_lambda)
|
||||||
|
if version is None:
|
||||||
|
return None
|
||||||
|
return 'macOS {} ({})'.format(version, machine())
|
||||||
|
|
||||||
|
if platform == 'linux':
|
||||||
|
# Ubuntu/Debian based
|
||||||
|
desc = get_lsb_version(run_lambda)
|
||||||
|
if desc is not None:
|
||||||
|
return '{} ({})'.format(desc, machine())
|
||||||
|
|
||||||
|
# Try reading /etc/*-release
|
||||||
|
desc = check_release_file(run_lambda)
|
||||||
|
if desc is not None:
|
||||||
|
return '{} ({})'.format(desc, machine())
|
||||||
|
|
||||||
|
return '{} ({})'.format(platform, machine())
|
||||||
|
|
||||||
|
# Unknown platform
|
||||||
|
return platform
|
||||||
|
|
||||||
|
|
||||||
|
def get_python_platform():
|
||||||
|
import platform
|
||||||
|
return platform.platform()
|
||||||
|
|
||||||
|
|
||||||
|
def get_libc_version():
|
||||||
|
import platform
|
||||||
|
if get_platform() != 'linux':
|
||||||
|
return 'N/A'
|
||||||
|
return '-'.join(platform.libc_ver())
|
||||||
|
|
||||||
|
|
||||||
|
def get_pip_packages(run_lambda, patterns=None):
|
||||||
|
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
|
||||||
|
if patterns is None:
|
||||||
|
patterns = DEFAULT_PIP_PATTERNS
|
||||||
|
|
||||||
|
# People generally have `pip` as `pip` or `pip3`
|
||||||
|
# But here it is invoked as `python -mpip`
|
||||||
|
def run_with_pip(pip):
|
||||||
|
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
|
||||||
|
return "\n".join(line for line in out.splitlines()
|
||||||
|
if any(name in line for name in patterns))
|
||||||
|
|
||||||
|
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
|
||||||
|
out = run_with_pip([sys.executable, '-mpip'])
|
||||||
|
|
||||||
|
return pip_version, out
|
||||||
|
|
||||||
|
|
||||||
|
def get_npu_info(run_lambda):
|
||||||
|
return run_and_read_all(run_lambda, 'npu-smi info')
|
||||||
|
|
||||||
|
|
||||||
|
def get_cann_info(run_lambda):
|
||||||
|
out = run_and_read_all(run_lambda, 'lscpu | grep Architecture:')
|
||||||
|
cpu_arch = str(out).split()[-1]
|
||||||
|
return run_and_read_all(
|
||||||
|
run_lambda,
|
||||||
|
'cat /usr/local/Ascend/ascend-toolkit/latest/{}-linux/ascend_toolkit_install.info'
|
||||||
|
.format(cpu_arch))
|
||||||
|
|
||||||
|
|
||||||
|
def get_env_vars():
|
||||||
|
env_vars = ''
|
||||||
|
secret_terms = ('secret', 'token', 'api', 'access', 'password')
|
||||||
|
report_prefix = ("TORCH", "PYTORCH", "ASCEND_", "ATB_")
|
||||||
|
for k, v in os.environ.items():
|
||||||
|
if any(term in k.lower() for term in secret_terms):
|
||||||
|
continue
|
||||||
|
if k in environment_variables:
|
||||||
|
env_vars = env_vars + "{}={}".format(k, v) + "\n"
|
||||||
|
if k.startswith(report_prefix):
|
||||||
|
env_vars = env_vars + "{}={}".format(k, v) + "\n"
|
||||||
|
|
||||||
|
return env_vars
|
||||||
|
|
||||||
|
|
||||||
|
def get_env_info():
|
||||||
|
run_lambda = run
|
||||||
|
pip_version, pip_list_output = get_pip_packages(run_lambda)
|
||||||
|
|
||||||
|
if TORCH_AVAILABLE:
|
||||||
|
version_str = torch.__version__
|
||||||
|
debug_mode_str = str(torch.version.debug)
|
||||||
|
else:
|
||||||
|
version_str = debug_mode_str = 'N/A'
|
||||||
|
|
||||||
|
sys_version = sys.version.replace("\n", " ")
|
||||||
|
|
||||||
|
conda_packages = get_conda_packages(run_lambda)
|
||||||
|
|
||||||
|
return SystemEnv(
|
||||||
|
torch_version=version_str,
|
||||||
|
is_debug_build=debug_mode_str,
|
||||||
|
python_version='{} ({}-bit runtime)'.format(
|
||||||
|
sys_version,
|
||||||
|
sys.maxsize.bit_length() + 1),
|
||||||
|
python_platform=get_python_platform(),
|
||||||
|
pip_version=pip_version,
|
||||||
|
pip_packages=pip_list_output,
|
||||||
|
conda_packages=conda_packages,
|
||||||
|
os=get_os(run_lambda),
|
||||||
|
libc_version=get_libc_version(),
|
||||||
|
gcc_version=get_gcc_version(run_lambda),
|
||||||
|
clang_version=get_clang_version(run_lambda),
|
||||||
|
cmake_version=get_cmake_version(run_lambda),
|
||||||
|
cpu_info=get_cpu_info(run_lambda),
|
||||||
|
vllm_version=get_vllm_version(),
|
||||||
|
vllm_ascend_version=get_vllm_ascend_version(),
|
||||||
|
env_vars=get_env_vars(),
|
||||||
|
npu_info=get_npu_info(run_lambda),
|
||||||
|
cann_info=get_cann_info(run_lambda),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
env_info_fmt = """
|
||||||
|
PyTorch version: {torch_version}
|
||||||
|
Is debug build: {is_debug_build}
|
||||||
|
|
||||||
|
OS: {os}
|
||||||
|
GCC version: {gcc_version}
|
||||||
|
Clang version: {clang_version}
|
||||||
|
CMake version: {cmake_version}
|
||||||
|
Libc version: {libc_version}
|
||||||
|
|
||||||
|
Python version: {python_version}
|
||||||
|
Python platform: {python_platform}
|
||||||
|
|
||||||
|
CPU:
|
||||||
|
{cpu_info}
|
||||||
|
|
||||||
|
Versions of relevant libraries:
|
||||||
|
{pip_packages}
|
||||||
|
{conda_packages}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# both the above code and the following code use `strip()` to
|
||||||
|
# remove leading/trailing whitespaces, so we need to add a newline
|
||||||
|
# in between to separate the two sections
|
||||||
|
env_info_fmt += "\n"
|
||||||
|
|
||||||
|
env_info_fmt += """
|
||||||
|
vLLM Version: {vllm_version}
|
||||||
|
vLLM Ascend Version: {vllm_ascend_version}
|
||||||
|
|
||||||
|
ENV Variables:
|
||||||
|
{env_vars}
|
||||||
|
|
||||||
|
NPU:
|
||||||
|
{npu_info}
|
||||||
|
|
||||||
|
CANN:
|
||||||
|
{cann_info}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
def pretty_str(envinfo):
|
||||||
|
|
||||||
|
def replace_nones(dct, replacement='Could not collect'):
|
||||||
|
for key in dct.keys():
|
||||||
|
if dct[key] is not None:
|
||||||
|
continue
|
||||||
|
dct[key] = replacement
|
||||||
|
return dct
|
||||||
|
|
||||||
|
def replace_bools(dct, true='Yes', false='No'):
|
||||||
|
for key in dct.keys():
|
||||||
|
if dct[key] is True:
|
||||||
|
dct[key] = true
|
||||||
|
elif dct[key] is False:
|
||||||
|
dct[key] = false
|
||||||
|
return dct
|
||||||
|
|
||||||
|
def prepend(text, tag='[prepend]'):
|
||||||
|
lines = text.split('\n')
|
||||||
|
updated_lines = [tag + line for line in lines]
|
||||||
|
return '\n'.join(updated_lines)
|
||||||
|
|
||||||
|
def replace_if_empty(text, replacement='No relevant packages'):
|
||||||
|
if text is not None and len(text) == 0:
|
||||||
|
return replacement
|
||||||
|
return text
|
||||||
|
|
||||||
|
def maybe_start_on_next_line(string):
|
||||||
|
# If `string` is multiline, prepend a \n to it.
|
||||||
|
if string is not None and len(string.split('\n')) > 1:
|
||||||
|
return '\n{}\n'.format(string)
|
||||||
|
return string
|
||||||
|
|
||||||
|
mutable_dict = envinfo._asdict()
|
||||||
|
|
||||||
|
# Replace True with Yes, False with No
|
||||||
|
mutable_dict = replace_bools(mutable_dict)
|
||||||
|
|
||||||
|
# Replace all None objects with 'Could not collect'
|
||||||
|
mutable_dict = replace_nones(mutable_dict)
|
||||||
|
|
||||||
|
# If either of these are '', replace with 'No relevant packages'
|
||||||
|
mutable_dict['pip_packages'] = replace_if_empty(
|
||||||
|
mutable_dict['pip_packages'])
|
||||||
|
mutable_dict['conda_packages'] = replace_if_empty(
|
||||||
|
mutable_dict['conda_packages'])
|
||||||
|
|
||||||
|
# Tag conda and pip packages with a prefix
|
||||||
|
# If they were previously None, they'll show up as ie '[conda] Could not collect'
|
||||||
|
if mutable_dict['pip_packages']:
|
||||||
|
mutable_dict['pip_packages'] = prepend(
|
||||||
|
mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
|
||||||
|
if mutable_dict['conda_packages']:
|
||||||
|
mutable_dict['conda_packages'] = prepend(
|
||||||
|
mutable_dict['conda_packages'], '[conda] ')
|
||||||
|
mutable_dict['cpu_info'] = envinfo.cpu_info
|
||||||
|
mutable_dict['npu_info'] = envinfo.npu_info
|
||||||
|
mutable_dict['cann_info'] = envinfo.cann_info
|
||||||
|
return env_info_fmt.format(**mutable_dict)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pretty_env_info():
|
||||||
|
return pretty_str(get_env_info())
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("Collecting environment information...")
|
||||||
|
output = get_pretty_env_info()
|
||||||
|
print(output)
|
||||||
|
|
||||||
|
if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
|
||||||
|
torch.utils, '_crash_handler'):
|
||||||
|
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
|
||||||
|
if sys.platform == "linux" and os.path.exists(minidump_dir):
|
||||||
|
dumps = [
|
||||||
|
os.path.join(minidump_dir, dump)
|
||||||
|
for dump in os.listdir(minidump_dir)
|
||||||
|
]
|
||||||
|
latest = max(dumps, key=os.path.getctime)
|
||||||
|
ctime = os.path.getctime(latest)
|
||||||
|
creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
|
||||||
|
'%Y-%m-%d %H:%M:%S')
|
||||||
|
msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
|
||||||
|
"if this is related to your bug please include it when you file a report ***"
|
||||||
|
print(msg, file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
338
csrc/camem_allocator.cpp
Normal file
338
csrc/camem_allocator.cpp
Normal file
@@ -0,0 +1,338 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
#define PY_SSIZE_T_CLEAN
|
||||||
|
#include <Python.h>
|
||||||
|
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include "acl/acl.h"
|
||||||
|
|
||||||
|
// Global references to Python callables
|
||||||
|
// NOTE: this is borrowed reference, so we don't need to DECREF them.
|
||||||
|
// This brings the limitation that the allocator needs to be singleton.
|
||||||
|
static PyObject* g_python_malloc_callback = nullptr;
|
||||||
|
static PyObject* g_python_free_callback = nullptr;
|
||||||
|
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Helper functions:
|
||||||
|
|
||||||
|
void ensure_context(unsigned long long device) {
|
||||||
|
aclrtContext pctx;
|
||||||
|
aclrtGetCurrentContext(&pctx);
|
||||||
|
if (!pctx) {
|
||||||
|
// Ensure device context.
|
||||||
|
aclrtCreateContext(&pctx, device);
|
||||||
|
aclrtSetCurrentContext(pctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
|
||||||
|
aclrtDrvMemHandle* p_memHandle) {
|
||||||
|
ensure_context(device);
|
||||||
|
// Define memory allocation properties
|
||||||
|
aclrtPhysicalMemProp prop = {};
|
||||||
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
||||||
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
||||||
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
||||||
|
prop.location.id = device;
|
||||||
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
prop.reserve = 0;
|
||||||
|
|
||||||
|
// Allocate memory using aclrtMallocPhysical
|
||||||
|
aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void unmap_and_release(unsigned long long device, ssize_t size,
|
||||||
|
void* d_mem,
|
||||||
|
aclrtDrvMemHandle* p_memHandle) {
|
||||||
|
// std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
|
||||||
|
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
|
||||||
|
ensure_context(device);
|
||||||
|
aclError error_code = aclrtUnmapMem(d_mem);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
error_code = aclrtFreePhysical(*p_memHandle);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pack four C integers into a new Python 4-tuple of ints.
// Returns a new reference, or NULL on allocation failure.
PyObject* create_tuple_from_c_integers(unsigned long long a,
                                       unsigned long long b,
                                       unsigned long long c,
                                       unsigned long long d) {
  PyObject* tuple = PyTuple_New(4);
  if (tuple == NULL) {
    return NULL;
  }

  const unsigned long long values[] = {a, b, c, d};
  for (Py_ssize_t i = 0; i < 4; ++i) {
    // PyTuple_SetItem steals the reference to the new PyLong, so no
    // explicit Py_DECREF is required for the items.
    PyTuple_SetItem(tuple, i, PyLong_FromUnsignedLongLong(values[i]));
  }

  return tuple;
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Our exported C functions that call Python:
|
||||||
|
|
||||||
|
__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, aclrtStream stream) {
|
||||||
|
ensure_context(device);
|
||||||
|
|
||||||
|
// first allocation, align the size, and reserve an address, and also allocate
|
||||||
|
// a aclrtDrvMemHandle
|
||||||
|
|
||||||
|
// Define memory allocation properties
|
||||||
|
aclrtPhysicalMemProp prop = {};
|
||||||
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
|
||||||
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
||||||
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
||||||
|
prop.location.id = device;
|
||||||
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
||||||
|
prop.reserve = 0;
|
||||||
|
|
||||||
|
// Check if the allocation is supported
|
||||||
|
size_t granularity;
|
||||||
|
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
|
||||||
|
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
|
||||||
|
&granularity);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
|
||||||
|
void *d_mem;
|
||||||
|
error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
// allocate the aclrtDrvMemHandle
|
||||||
|
aclrtDrvMemHandle* p_memHandle =
|
||||||
|
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
||||||
|
|
||||||
|
if (!g_python_malloc_callback) {
|
||||||
|
std::cerr << "ERROR: g_python_malloc_callback not set.\n";
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acquire GIL (not in stable ABI officially, but often works)
|
||||||
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||||
|
|
||||||
|
PyObject* arg_tuple = create_tuple_from_c_integers(
|
||||||
|
(unsigned long long)device, (unsigned long long)alignedSize,
|
||||||
|
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
|
||||||
|
|
||||||
|
// Call g_python_malloc_callback
|
||||||
|
PyObject* py_result =
|
||||||
|
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
|
||||||
|
Py_DECREF(arg_tuple);
|
||||||
|
|
||||||
|
if (!py_result) {
|
||||||
|
PyErr_Print();
|
||||||
|
PyGILState_Release(gstate);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyGILState_Release(gstate);
|
||||||
|
|
||||||
|
// do the final mapping
|
||||||
|
create_and_map(device, alignedSize, d_mem, p_memHandle);
|
||||||
|
|
||||||
|
return (void*)d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
|
||||||
|
// get memory handle from the pointer
|
||||||
|
if (!g_python_free_callback) {
|
||||||
|
std::cerr << "ERROR: g_python_free_callback not set.\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acquire GIL (not in stable ABI officially, but often works)
|
||||||
|
PyGILState_STATE gstate = PyGILState_Ensure();
|
||||||
|
|
||||||
|
PyObject* py_ptr =
|
||||||
|
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
|
||||||
|
|
||||||
|
PyObject* py_result =
|
||||||
|
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
|
||||||
|
|
||||||
|
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
|
||||||
|
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned long long recv_device, recv_size;
|
||||||
|
unsigned long long recv_d_mem, recv_p_memHandle;
|
||||||
|
// Unpack the tuple into four C integers
|
||||||
|
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
|
||||||
|
&recv_d_mem, &recv_p_memHandle)) {
|
||||||
|
// PyArg_ParseTuple sets an error if it fails
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyGILState_Release(gstate);
|
||||||
|
|
||||||
|
// recv_size == size
|
||||||
|
// recv_device == device
|
||||||
|
|
||||||
|
// Free memory
|
||||||
|
|
||||||
|
void *d_mem = (void*)recv_d_mem;
|
||||||
|
// allocate the aclrtDrvMemHandle
|
||||||
|
aclrtDrvMemHandle* p_memHandle =
|
||||||
|
(aclrtDrvMemHandle*)recv_p_memHandle;
|
||||||
|
unmap_and_release(device, size, d_mem, p_memHandle);
|
||||||
|
|
||||||
|
// free address and the handle
|
||||||
|
aclError error_code = aclrtReleaseMemAddress(d_mem);
|
||||||
|
if (error_code != 0) {
|
||||||
|
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
|
||||||
|
<< __LINE__ << std::endl;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
free(p_memHandle);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Python extension boilerplate:
|
||||||
|
|
||||||
|
// Python-exposed function: init_module(python_malloc, python_free)
|
||||||
|
// init_module(python_malloc, python_free)
// Registers the two Python callables that my_malloc/my_free invoke.
static PyObject* py_init_module(PyObject* self, PyObject* args) {
  PyObject* py_malloc = nullptr;
  PyObject* py_free = nullptr;

  if (!PyArg_ParseTuple(args, "OO", &py_malloc, &py_free)) {
    return nullptr;  // PyArg_ParseTuple already set the exception.
  }

  if (!PyCallable_Check(py_malloc) || !PyCallable_Check(py_free)) {
    PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
    return nullptr;
  }

  // Stash borrowed references. This module performs no GC bookkeeping on
  // them, so the caller must keep both callables alive for as long as the
  // (singleton) allocator is in use.
  g_python_malloc_callback = py_malloc;
  g_python_free_callback = py_free;

  Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
// python_unmap_and_release(device, size, d_mem, p_memHandle)
// Python-visible wrapper around unmap_and_release; all four arguments are
// passed as unsigned integers (pointers as addresses).
static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  unsigned long long device_id = 0;
  unsigned long long length = 0;
  unsigned long long mem_addr = 0;
  unsigned long long handle_addr = 0;
  if (!PyArg_ParseTuple(args, "KKKK", &device_id, &length, &mem_addr,
                        &handle_addr)) {
    return nullptr;  // Exception already set by PyArg_ParseTuple.
  }

  // Reinterpret the integer addresses back into pointers and delegate.
  unmap_and_release(device_id, length, (void*)mem_addr,
                    (aclrtDrvMemHandle*)handle_addr);

  Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
// python_create_and_map(device, size, d_mem, p_memHandle)
// Python-visible wrapper around create_and_map; all four arguments are
// passed as unsigned integers (pointers as addresses).
static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
  if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
    PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
    return nullptr;
  }

  unsigned long long device_id = 0;
  unsigned long long length = 0;
  unsigned long long mem_addr = 0;
  unsigned long long handle_addr = 0;
  if (!PyArg_ParseTuple(args, "KKKK", &device_id, &length, &mem_addr,
                        &handle_addr)) {
    return nullptr;  // Exception already set by PyArg_ParseTuple.
  }

  // Reinterpret the integer addresses back into pointers and delegate.
  create_and_map(device_id, length, (void*)mem_addr,
                 (aclrtDrvMemHandle*)handle_addr);

  Py_RETURN_NONE;
}
|
||||||
|
|
||||||
|
static PyMethodDef module_methods[] = {
|
||||||
|
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
|
||||||
|
"Initialize module with python_malloc and python_free callables."},
|
||||||
|
{"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS,
|
||||||
|
"Create and map memory on the device."},
|
||||||
|
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
|
||||||
|
METH_VARARGS, "Unmap and release memory on the device."},
|
||||||
|
{NULL, NULL, 0, NULL} // sentinel
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct PyModuleDef camem_allocator_module = {
|
||||||
|
PyModuleDef_HEAD_INIT, "camem_allocator",
|
||||||
|
"CANN-mem-based allocator for NPUPluggableAllocator", -1, module_methods};
|
||||||
|
|
||||||
|
PyMODINIT_FUNC PyInit_vllm_ascend_C(void) {
|
||||||
|
// Initialize the module
|
||||||
|
PyObject* module = PyModule_Create(&camem_allocator_module);
|
||||||
|
if (!module) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return module;
|
||||||
|
}
|
||||||
|
} // extern "C"
|
||||||
369
csrc/kernels/bgmv_expand.cpp
Normal file
369
csrc/kernels/bgmv_expand.cpp
Normal file
@@ -0,0 +1,369 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include "types.h"
|
||||||
|
|
||||||
|
// AscendC kernel implementing the "expand" half of BGMV LoRA:
// for each token, y[slice] += scale-free matmul of x (rank-sized vector,
// fp32) with the per-request LoRA B weight (rank x outputHiddenDim).
// One AI core processes `numTokensPerCore` tokens; W is streamed in tiles
// and reduced with BlockReduceSum/PairReduceSum depending on the rank.
template <typename scalar_t>
class BGMVExpand {
public:
    using X_T = float;     // input activations are fp32
    using W_T = scalar_t;  // LoRA weight dtype (half / bfloat16_t)
    using Y_T = scalar_t;  // output dtype matches the weights

    static constexpr uint64_t LORA_RANK_8 = 8;
    static constexpr uint64_t LORA_RANK_16 = 16;
    static constexpr uint64_t LORA_RANK_32 = 32;
    static constexpr uint64_t LORA_RANK_64 = 64;
    static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
    static constexpr int32_t BUFFER_NUM = 2;  // double buffering for streamed queues

    // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
    static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
    static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
    // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
    static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
    // Mask is used to control the elements that participate in computation in each iteration.
    static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
    // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
    static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
    static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
    static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
    // BlockReduceSum would generate (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats.
    // So need to read them all and apply PairReduceSum.
    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 =
        (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
    // The second PairReduceSum for rank=32 needs half of the repetition that happened for rank=16.
    // Same for rank=64; ranks greater than 64 are not supported.
    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;

public:
    __aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}

    // Bind global buffers and carve out the local queues/buffers.
    // indicesSize is the number of int64 LoRA indices (one per token).
    // sliceOffset/outputFullDim place this kernel's output slice inside
    // the full output row (supports fused qkv-style sliced outputs).
    __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,
                                uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,
                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
                                uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
    {
        batchSize_ = batchSize;
        numTokensPerCore_ = numTokensPerCore;
        maxLoRARank_ = maxLoRARank;
        outputHiddenDim_ = outputHiddenDim;
        sliceOffset_ = sliceOffset;
        outputFullDim_ = outputFullDim;
        singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;

        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
        yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);

        pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
        pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
        pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));

        pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
        pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
        pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
        pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));

        // Each compute iteration would generate not one, but several output elements.
        // Therefore, the following variable would determine how many output elements are calculated in each iteration.
        numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
        numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
    }

    // Main loop: each core handles tokens [blockIdx*numTokensPerCore,
    // min(batchSize, (blockIdx+1)*numTokensPerCore)). Tokens whose LoRA
    // index is negative (no adapter) are skipped.
    __aicore__ inline void Process()
    {
        int64_t blockIdx = AscendC::GetBlockIdx();
        int64_t startIdx = blockIdx * numTokensPerCore_;
        int64_t endIdx = startIdx + numTokensPerCore_;
        if (endIdx > batchSize_) {
            endIdx = batchSize_;  // last core may get fewer tokens
        }
        for (int64_t idx = startIdx; idx < endIdx; idx++) {
            // Offset of this token's output slice in the full output row.
            yOffset_ = outputFullDim_ * idx + sliceOffset_;

            // Set up LoRA index
            CopyInIndex(idx);
            if (reqLoRAIndex_ < 0) {
                continue;  // token has no LoRA adapter
            }
            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;

            CopyInX(idx);
            // Stream full Y tiles; the tail (if any) is handled afterwards.
            int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
            for (int32_t i = 0; i < numStreamOut; i++) {
                CopyInY(i);
                for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
                    CopyInW(i * numStreamInPerOutputTile_ + j);
                    Compute(j * numOutputElementsPerInputTile_);
                }
                ScaleOutput();
                CopyOut(i);
            }
            ComputeLastIteration();
        }
    }

private:
    // Look up the LoRA adapter index for token `idx` (negative = none).
    __aicore__ inline void CopyInIndex(const int64_t idx)
    {
        // Look up the LoRA index
        reqLoRAIndex_ = indicesGm_.GetValue(idx);
    }

    // Handle the tail tile when outputHiddenDim_ is not a multiple of
    // Y_OUT_TILE_NUM_ELEMENTS: stream the remaining W in full tiles plus
    // one partial tile with recomputed repeat counts.
    __aicore__ inline void ComputeLastIteration()
    {
        int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
        if (remainingY == 0) {
            return;  // no tail; everything was covered by full tiles
        }
        int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
        int32_t remainingW = remainingY * maxLoRARank_;
        int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
        int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;

        CopyInY(numStreamOut, remainingY);

        int32_t outputIdx = 0;
        for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
            CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
            Compute(outputIdx * numOutputElementsPerInputTile_);
        }

        if (remainingWForLastRepeat != 0) {
            CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
                    remainingWForLastRepeat);
            // Recompute the reduce repeat counts for the partial tile.
            int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
            int32_t pairReduceRepeat16 =
                (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
            int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
            int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
            Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
        }

        ScaleOutput(remainingY);
        CopyOut(numStreamOut, remainingY);
    }

    // Load this token's rank-sized x vector and replicate it across one
    // full repeat (256 B) so a single vector op multiplies it against
    // NUM_ELEMENTS_PER_REPEAT / rank weight rows at once.
    __aicore__ inline void CopyInX(const int64_t idx)
    {
        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
        if constexpr (std::is_same_v<X_T, float>) {
            DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
        } else {
            // Non-fp32 x: rank*sizeof(X_T) bytes may not be block-aligned,
            // so use DataCopyPad.
            uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
            DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
        }
        inQueueX_.EnQue(xLocal);
        xLocal = inQueueX_.DeQue<X_T>();
        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();

        // As we are generating multiple output elements with one API invocation,
        // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT.
        if constexpr (std::is_same_v<X_T, float>) {
            for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
                for (int32_t j = 0; j < maxLoRARank_; j++) {
                    float entry = xLocal.GetValue(j);
                    xDup.SetValue(i + j, entry);
                }
            }
        } else {
            // Cast the first copy to fp32, then replicate it in-place.
            Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
            pipe_barrier(PIPE_V);

            for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
                for (int32_t j = 0; j < maxLoRARank_; j++) {
                    float entry = xDup.GetValue(j);
                    xDup.SetValue(i + j, entry);
                }
            }
        }
        inQueueX_.FreeTensor(xLocal);
    }

    // Stream in one tile of the existing output y (to be accumulated into).
    __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
        DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
        inQueueY_.EnQue(yInLocal);
    }

    // Stream in one tile of the selected request's LoRA weight.
    __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
        inQueueW_.EnQue(wLocal);
    }

    // Accumulate the freshly computed partial results (tmpBufferY_) onto the
    // incoming y tile in fp32, then cast back to Y_T for the output queue.
    __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
        AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
        Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
        pipe_barrier(PIPE_V);
        inQueueY_.FreeTensor(yInLocal);

        // y += partial results (fp32 accumulate).
        Add(yLocal, yLocal, yInLocalFP32, numElements);
        pipe_barrier(PIPE_V);

        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
        Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
        pipe_barrier(PIPE_V);

        outQueueY_.EnQue<Y_T>(yOutLocal);
    }

    // Core dot-product step for one W tile:
    //   cast W to fp32, multiply element-wise against the replicated x, then
    //   tree-reduce each rank-sized group to a single float. The reduce
    //   sequence depends on the rank (8 -> one BlockReduceSum; 16/32/64 add
    //   PairReduceSum/BlockReduceSum stages). Results land at
    //   yLocal[progress ...].
    __aicore__ inline void Compute(int32_t progress,
                                   int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
                                   int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
                                   int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
    {
        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();

        Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
        pipe_barrier(PIPE_V);
        inQueueW_.FreeTensor(wLocal);

        // Element-wise x * w; xDup is reused every repeat (src0RepStride=0).
        Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
        pipe_barrier(PIPE_V);

        if (maxLoRARank_ == LORA_RANK_8) {
            BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_16) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_32) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_64) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        }
    }

    // Write one finished output tile back to global memory.
    __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
        DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
        outQueueY_.FreeTensor(yOutLocal);
    }

private:
    AscendC::TPipe* pipe_;
    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
    AscendC::GlobalTensor<X_T> xGm_;
    AscendC::GlobalTensor<W_T> wGm_;
    AscendC::GlobalTensor<Y_T> yInGm_;
    AscendC::GlobalTensor<Y_T> yOutGm_;
    AscendC::GlobalTensor<int64_t> indicesGm_;
    uint32_t batchSize_;            // total number of tokens
    uint32_t numTokensPerCore_;     // tokens assigned to each AI core
    uint32_t maxLoRARank_;          // LoRA rank (8/16/32/64)
    uint32_t outputHiddenDim_;      // width of this kernel's output slice
    uint32_t sliceOffset_;          // offset of the slice in the full row
    uint32_t outputFullDim_;        // full output row width
    uint32_t singleLoRAWeightLen_;  // elements per adapter: rank * outputHiddenDim
    int64_t reqLoRAIndex_;          // current token's adapter index (<0 = none)
    uint64_t reqLoRAWeightOffset_;  // element offset of that adapter in wGm_
    uint32_t numOutputElementsPerInputTile_;
    uint32_t numStreamInPerOutputTile_;
    uint64_t yOffset_;              // global offset of the current token's slice

    // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously.
    // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat,
    // reads next 8 consecutive blocks in the second repeat.
    AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};

    // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block,
    // so we set dstRepStride = 1.
    AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};

    // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks.
    // For xDup we repeatedly use it, so we set src0RepStride = 0.
    AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
};
|
||||||
|
|
||||||
|
// Instantiate one extern "C" device kernel entry point per element type.
// Each instance builds a TPipe, constructs the type-specialized BGMVExpand
// operator, and runs it over the launch's token range.
#define BGMV_EXPAND_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
                                    uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,\
                                    uint32_t batchSize, uint32_t numTokensPerCore, \
                                    uint32_t maxLoRARank, uint32_t outputHiddenDim, \
                                    uint32_t sliceOffset, uint32_t outputFullDim) \
{ \
    AscendC::TPipe pipe; \
    BGMVExpand<TYPE> op(&pipe); \
    op.Init(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \
            outputHiddenDim, sliceOffset, outputFullDim); \
    op.Process(); \
}

// Declare kernels for every supported dtype.
BGMV_EXPAND_TYPE_DECLARE(half)
// bfloat16 requires AI Core arch >= 220 (e.g. Atlas A2 series).
#if (__CCE_AICORE__ >= 220)
BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
#endif
|
||||||
|
|
||||||
|
namespace vllm_ascend {
// Host-side launcher for the bgmv_expand kernels: selects the
// dtype-specialized kernel and enqueues it on `stream`, with enough AI
// cores so that each core handles `numTokensPerCore` tokens.
extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
                             void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
                             uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
    // Round up so every token in the batch is covered.
    uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
    if (type == AscendType::FP16) {
        bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore,
                                                        maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim);
    } else if (type == AscendType::BF16) {
// NOTE(review): on arch < 220 a BF16 request silently does nothing —
// presumably callers validate the dtype upstream; confirm.
#if (__CCE_AICORE__ >= 220)
        bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
                                                              numTokensPerCore, maxLoRARank, outputHiddenDim,
                                                              sliceOffset, outputFullDim);
#endif
    } else {
        // Unsupported dtype: no-op.
        return;
    }
}

} // namespace vllm_ascend
|
||||||
252
csrc/kernels/bgmv_shrink.cpp
Normal file
252
csrc/kernels/bgmv_shrink.cpp
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include "types.h"
|
||||||
|
|
||||||
|
// BGMV (batched grouped matrix-vector) "shrink" kernel for LoRA: for each token,
// projects the input hidden state (inputHiddenDim) down to the LoRA rank
// (maxLoRARank) using that token's LoRA-A weight matrix, producing a float
// output of shape [batch, maxLoRARank] scaled by `scale`.
// Each row of the output is computed as a dot product between the input vector
// and one row of the selected LoRA weight, accumulated tile by tile.
template <typename scalar_t>
class BGMVShrink {
public:
    using X_T = scalar_t;   // input dtype
    using W_T = scalar_t;   // weight dtype (same as input)
    using Y_T = float;      // output is always fp32

    static constexpr uint64_t BUFFER_NUM = 1;
    static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length

public:
    __aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}

    // Binds global-memory buffers and allocates all local queues/buffers.
    //   x:        [batch, inputHiddenDim] input activations
    //   weight:   stacked LoRA-A weights, singleLoRAWeightLen_ elements per LoRA
    //   indices:  per-token LoRA index (int64); negative means "no LoRA"
    //   y:        [batch, maxLoRARank] fp32 output
    __aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, uint32_t indicesSize, __gm__ void *y,
                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
                                uint32_t maxLoRARank, float scale)
    {
        batchSize_ = batchSize;
        numTokensPerCore_ = numTokensPerCore;
        inputHiddenDim_ = inputHiddenDim;
        maxLoRARank_ = maxLoRARank;
        scale_ = scale;
        singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
        // "Incremental" mode: the input row does not fit in one tile, so X must be
        // re-streamed tile-by-tile for every weight row instead of loaded once.
        incremental_ = inputHiddenDim_ > TILE_LENGTH;

        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
        indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);

        // One tile of X and W staged through VECIN queues...
        pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
        // ...plus fp32 scratch for the upcast copies used in the dot product.
        pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
        pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));

        // Per-token output row: accumulated in outBufferY_, staged out via outQueueY_.
        pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
        pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
    }

    // Processes this core's contiguous slice of tokens [startIdx, endIdx).
    __aicore__ inline void Process()
    {
        int64_t blockIdx = AscendC::GetBlockIdx();
        int64_t startIdx = blockIdx * numTokensPerCore_;
        int64_t endIdx = startIdx + numTokensPerCore_;
        if (endIdx > batchSize_) {
            endIdx = batchSize_;  // last core may get a partial chunk
        }
        for (int64_t idx = startIdx; idx < endIdx; idx++) {
            // set up LoRA index
            CopyInIndex(idx);
            if (reqLoRAIndex_ < 0) {
                continue;  // negative index: token has no LoRA; skip (output untouched)
            }
            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;

            if (incremental_) {
                ProcessImpl<true>(idx);
            } else {
                ProcessImpl<false>(idx);
            }

            ScaleOutput();
            CopyOut(idx);
        }
    }

private:
    // Computes one token's output row (length maxLoRARank_).
    // INCREMENTAL_MODE=false: X fits in a single tile — load and upcast it once,
    // then reuse it for every weight row. INCREMENTAL_MODE=true: X is re-copied
    // tile-by-tile inside the inner loop alongside the matching weight tile.
    template <bool INCREMENTAL_MODE>
    __aicore__ inline void ProcessImpl(const int64_t idx)
    {
        AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
        if constexpr (!INCREMENTAL_MODE) {
            CopyInX(idx, 0, inputHiddenDim_);
            AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
            pipe_barrier(PIPE_V);
            inQueueX_.FreeTensor(xLocal);
        }

        for (int i = 0; i < maxLoRARank_; i++) {
            float acc(0);
            // Full tiles of the dot product between X and weight row i.
            for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
                if constexpr (INCREMENTAL_MODE) {
                    CopyInX(idx, j);
                }
                CopyInW(i, j);
                Compute<INCREMENTAL_MODE>(acc);
            }
            // Tail tile (inputHiddenDim_ % TILE_LENGTH elements), if any.
            CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
            yOutLocal.SetValue(i, acc);
        }
    }

    __aicore__ inline void CopyInIndex(const int64_t idx)
    {
        // look up the LoRA index
        reqLoRAIndex_ = indicesGm_.GetValue(idx);
    }

    // Stages one tile of the input row for token `idx` into inQueueX_.
    __aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
    {
        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
        DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
        inQueueX_.EnQue(xLocal);
    }

    // Stages one tile of row `rowIdx` of the selected LoRA weight into inQueueW_.
    __aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
    {
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
        inQueueW_.EnQue(wLocal);
    }

    // Consumes one staged X/W tile pair: upcasts to fp32, multiplies elementwise,
    // reduce-sums, and adds the partial dot product into `acc`.
    // In non-incremental mode X was already upcast into tmpBufferX_ by ProcessImpl,
    // so only W is dequeued and cast here.
    template <bool INCREMENTAL_MODE>
    __aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
    {
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
        AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();

        if constexpr (INCREMENTAL_MODE) {
            AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
            Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
            pipe_barrier(PIPE_V);
            inQueueX_.FreeTensor(xLocal);
            inQueueW_.FreeTensor(wLocal);
        } else {
            Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
            pipe_barrier(PIPE_V);
            inQueueW_.FreeTensor(wLocal);
        }
        // dot product of the one tile of X and W
        Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
        pipe_barrier(PIPE_V);
        // reduce sum generate one number, which is the summation of all the dot product
        ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
        pipe_barrier(PIPE_V);

        acc += wTmpTensor.GetValue(0);
    }

    // Handles the final partial tile of a row; no-op when the hidden dim divides
    // evenly by TILE_LENGTH.
    template <bool INCREMENTAL_MODE>
    __aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
    {
        int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
        int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
        if (remaining == 0) {
            return;
        }
        if constexpr (INCREMENTAL_MODE) {
            CopyInX(idx, colIdx, remaining);
        }
        CopyInW(rowIdx, colIdx, remaining);
        Compute<INCREMENTAL_MODE>(acc, remaining);
    }

    // Multiplies the accumulated row by scale_ and enqueues it for copy-out.
    __aicore__ inline void ScaleOutput()
    {
        AscendC::LocalTensor<float> yLocal = outBufferY_.Get<float>();
        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();

        Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
        pipe_barrier(PIPE_V);

        outQueueY_.EnQue<Y_T>(yOutLocal);
    }

    // Writes the finished row to global memory at yOut[idx * maxLoRARank_].
    __aicore__ inline void CopyOut(const int64_t idx)
    {
        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
        DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
        outQueueY_.FreeTensor(yOutLocal);
    }

private:
    AscendC::TPipe *pipe_;
    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
    AscendC::GlobalTensor<X_T> xGm_;
    AscendC::GlobalTensor<W_T> wGm_;
    AscendC::GlobalTensor<int64_t> indicesGm_;
    AscendC::GlobalTensor<Y_T> yOutGm_;
    uint32_t batchSize_;
    uint32_t numTokensPerCore_;
    uint32_t inputHiddenDim_;
    uint32_t maxLoRARank_;
    float scale_;
    // inputHiddenDim_ * maxLoRARank_: element count of one LoRA's A matrix.
    uint32_t singleLoRAWeightLen_;
    int64_t reqLoRAIndex_;        // current token's LoRA id (< 0 means none)
    uint64_t reqLoRAWeightOffset_; // element offset of that LoRA's weights in wGm_
    bool incremental_;            // true when the row is larger than one tile
};
|
||||||
|
|
||||||
|
// Stamps out one extern "C" device kernel entry point, bgmv_shrink_<TYPE>, for the
// given element type: builds a TPipe, constructs a BGMVShrink<TYPE> op, forwards
// the launch arguments to op.Init(), and runs op.Process().
// NOTE: comments cannot appear inside the macro body because every line ends with
// a backslash continuation; the generated symbol names (bgmv_shrink_half,
// bgmv_shrink_bfloat16_t) must match the host-side launcher below.
#define BGMV_SHRINK_TYPE_DECLARE(TYPE) \
    extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
                                                             uint32_t indicesSize, __gm__ void* y, uint32_t batchSize, \
                                                             uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
                                                             uint32_t maxLoRARank, float scale) \
    { \
        AscendC::TPipe pipe; \
        BGMVShrink<TYPE> op(&pipe); \
        op.Init(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
        op.Process(); \
    }

// declare all dtype kernel
BGMV_SHRINK_TYPE_DECLARE(half)
// bfloat16 is only available on newer AI Core architectures (>= 220).
#if (__CCE_AICORE__ >= 220)
BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
#endif
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
|
||||||
|
void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
|
||||||
|
uint32_t maxLoRARank, float scale)
|
||||||
|
{
|
||||||
|
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
|
||||||
|
if (type == AscendType::FP16) {
|
||||||
|
bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
|
||||||
|
inputHiddenDim, maxLoRARank, scale);
|
||||||
|
} else if (type == AscendType::BF16) {
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
|
||||||
|
inputHiddenDim, maxLoRARank, scale);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm_ascend
|
||||||
378
csrc/kernels/get_masked_input_and_mask_kernel.cpp
Normal file
378
csrc/kernels/get_masked_input_and_mask_kernel.cpp
Normal file
@@ -0,0 +1,378 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include "kernel_tensor_impl.h"
|
||||||
|
#include "kernel_type.h"
|
||||||
|
#include "types.h"
|
||||||
|
#include "utils.h"
|
||||||
|
using vllm_ascend::AccType;
|
||||||
|
|
||||||
|
// Maps raw token ids onto a tensor-parallel rank's local embedding table.
// For each element of `input` it:
//   1. Builds a boolean mask for ids inside [org_vocab_start, org_vocab_end) and
//      another for ids inside [added_vocab_start, added_vocab_end).
//   2. Subtracts the appropriate start offset so in-range ids become local indices.
//   3. Zeros out-of-range ids (masked_input) and emits mask_out = "id NOT owned
//      by this rank" (1 where neither range matched).
// Comparison masks are built with float arithmetic tricks (see CompareWithValue)
// because there is no direct elementwise compare-to-int8 path used here.
template<typename scalar_t>
class GetMaskedInputAndMask {
public:
    __aicore__ inline GetMaskedInputAndMask() {}

    // Release pipe resources so the same op object can be re-Init'ed by the
    // caller loop in get_masked_input_and_mask_kernel.
    __aicore__ inline ~GetMaskedInputAndMask() {
        pipe.Reset();
    }


    // Binds global buffers and allocates all local queues/buffers.
    // `size` is rounded up to a multiple of 32 elements (vector alignment);
    // NOTE(review): this means up to 31 elements past `size` are read/written —
    // presumably the caller over-allocates; confirm against the host launcher.
    __aicore__ inline void Init(
        __gm__ scalar_t* input,
        __gm__ scalar_t* masked_input,
        __gm__ bool* mask_out,
        const int64_t org_vocab_start_index,
        const int64_t org_vocab_end_index,
        const int64_t num_org_vocab_padding,
        const int64_t added_vocab_start_index,
        const int64_t added_vocab_end_index,
        const int64_t size)
    {
        // Initialize basic parameters
        input_ = input;
        masked_input_ = masked_input;
        mask_out_ = mask_out;
        org_vocab_start_index_ = org_vocab_start_index;
        org_vocab_end_index_ = org_vocab_end_index;
        size_ = ((size + 31) / 32) * 32;
        // Offset applied to "added vocab" ids so they land directly after the
        // org-vocab rows (plus padding) in the local table.
        added_offset_ = added_vocab_start_index -
                        (org_vocab_end_index - org_vocab_start_index) -
                        num_org_vocab_padding;
        added_vocab_start_index_ = added_vocab_start_index;
        added_vocab_end_index_ = added_vocab_end_index;

        // Initialize global tensors
        inputGlobal.SetGlobalBuffer(input);
        maskedOutputGlobal.SetGlobalBuffer(masked_input);
        maskOutGlobal.SetGlobalBuffer(mask_out);

        // Initialize queues
        pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t));
        pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t));
        pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool));

        // Initialize calculation buffers
        // NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs.
        pipe.InitBuffer(calc_buf_1, size_ * sizeof(float));
        pipe.InitBuffer(calc_buf_2, size_ * sizeof(float));

        // Initialize result queues
        pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float));
        pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float));
        pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float));
        pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float));

        // Initialize temporary buffers
        pipe.InitBuffer(start_buf, size_ * sizeof(float));
        pipe.InitBuffer(end_buf, size_ * sizeof(float));
        pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting
        pipe.InitBuffer(validOffset_buf, size_ * sizeof(float));
        pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t));
        pipe.InitBuffer(ones_buf_, size_ * sizeof(float));
    }

    // Standard three-stage pipeline: load, compute, store.
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    // Stages size_ input ids from global memory into the VECIN queue.
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<scalar_t> inputLocal = inQueue.AllocTensor<scalar_t>();
        AscendC::DataCopy(inputLocal, inputGlobal, size_);
        inQueue.EnQue(inputLocal);
    }

    // Elementwise compare producing an int8 0/1 mask without a compare op:
    //   is_greater_equal=true : result = (input >= compare_value) ? 1 : 0
    //   is_greater_equal=false: result = (input <  compare_value) ? 1 : 0
    // Trick: Max/Sub produce 0 exactly where the condition holds; the value is
    // then clamped to MIN_ACCURACY_FP32, blown up to exactly 1.0 via the two
    // MAX_MUL multiplications (any nonzero difference saturates to 1), shifted
    // by -1 and Abs'ed, turning {0, 1} into {1, 0} / {0, 1} as needed.
    // clobbers: calc_buf_1 (float scratch), calc_buf_2 (half scratch).
    __aicore__ inline void CompareWithValue(
        AscendC::LocalTensor<int8_t>& result,
        const AscendC::LocalTensor<float>& input,
        const AscendC::LocalTensor<float>& compare_value,
        bool is_greater_equal) {

        AscendC::LocalTensor<float> compute_buf = calc_buf_1.Get<float>();
        if (is_greater_equal) {
            // diff = compare_value - max(input, compare_value): 0 iff input >= compare_value... 
            // NOTE(review): actually 0 iff max == compare_value, i.e. input <= compare_value;
            // combined with the caller's argument order this yields the intended >=; confirm.
            AscendC::Max(compute_buf, input, compare_value, size_);
            AscendC::Sub(compute_buf, compare_value, compute_buf, size_);
        } else {
            // diff = max(input, compare_value) - compare_value: 0 iff input <= compare_value.
            AscendC::Max(compute_buf, input, compare_value, size_);
            AscendC::Sub(compute_buf, compute_buf, compare_value, size_);
        }

        // |diff| -> clamp to tiny nonzero -> scale up so any nonzero becomes 1.0.
        AscendC::Abs(compute_buf, compute_buf, size_);
        AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_);
        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
        AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_);
        // {0,1} -> {1,0}: the mask is 1 where diff was 0 (condition satisfied).
        AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_);
        AscendC::Abs(compute_buf, compute_buf, size_);

        // float -> half -> int8 (no direct float->int8 cast path used here).
        AscendC::LocalTensor<half> compute_buf_fp16 = calc_buf_2.Get<half>();
        AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_);
        AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_);
    }

    // range_mask = (start_value <= input < end_value) as int8 0/1.
    // clobbers: start_buf, end_buf, calc_buf_1, calc_buf_2; allocates (and never
    // frees) one tensor each from result_ge_que / result_le_que —
    // NOTE(review): those AllocTensor results are not FreeTensor'ed; with
    // BUFFER_NUM=2 and two calls per Compute() this appears to consume exactly
    // the available buffers — confirm this is intentional.
    __aicore__ inline void ComputeRangeMask(
        AscendC::LocalTensor<int8_t>& range_mask,
        const AscendC::LocalTensor<float>& input,
        const float start_value,
        const float end_value) {

        // Broadcast the scalar bounds to full-width tensors.
        AscendC::LocalTensor<float> start_value_tensor = start_buf.Get<float>();
        AscendC::LocalTensor<float> end_value_tensor = end_buf.Get<float>();

        AscendC::Duplicate(start_value_tensor, start_value, size_);
        AscendC::Duplicate(end_value_tensor, end_value, size_);

        AscendC::LocalTensor<int8_t> ge_result = result_ge_que.AllocTensor<int8_t>();
        AscendC::LocalTensor<int8_t> lt_result = result_le_que.AllocTensor<int8_t>();

        // ge_result = input >= start ; lt_result = input < end.
        CompareWithValue(ge_result, start_value_tensor, input, true);
        CompareWithValue(lt_result, input, end_value_tensor, false);

#if (__CCE_AICORE__ >= 220)
        AscendC::And(range_mask, ge_result, lt_result, size_);
#else
        {
            // WORKAROUND for older arch
            // No direct int8->int16 cast. Use half as intermediate.
            // No direct int8 And. Use int16 And.
            AscendC::LocalTensor<int16_t> ge_result_i16 = calc_buf_1.Get<int16_t>();
            AscendC::LocalTensor<int16_t> lt_result_i16 = calc_buf_2.Get<int16_t>();
            AscendC::LocalTensor<int16_t> range_mask_i16 = ge_result_i16;

            // Use a temporary buffer for half type
            AscendC::LocalTensor<half> tmp_half = inputFloat_buf.Get<half>();

            // 1. Cast inputs: int8_t -> half -> int16_t
            AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);

            AscendC::Cast(tmp_half, lt_result, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(lt_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);

            // 2. Perform And on int16_t tensors
            AscendC::And(range_mask_i16, ge_result_i16, lt_result_i16, size_);

            // 3. Cast result back: int16_t -> half -> int8_t
            AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
        }
#endif
    }

    // Core transformation: build both range masks, apply the per-range offsets,
    // zero ids outside both ranges, and derive the inverse ownership mask.
    __aicore__ inline void Compute() {
        AscendC::LocalTensor<scalar_t> inputLocal = inQueue.DeQue<scalar_t>();
        AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.AllocTensor<scalar_t>();
        AscendC::LocalTensor<int8_t> maskLocal = maskQueue.AllocTensor<int8_t>();

        // Work in float throughout the mask arithmetic.
        AscendC::LocalTensor<float> inputFloat = inputFloat_buf.Get<float>();
        AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_);

        // orgVocabMask = id in [org_vocab_start, org_vocab_end)
        AscendC::LocalTensor<int8_t> orgVocabMask = result_org_mask_que.AllocTensor<int8_t>();
        ComputeRangeMask(orgVocabMask,
                        inputFloat,
                        static_cast<float>(org_vocab_start_index_),
                        static_cast<float>(org_vocab_end_index_));

        // addedVocabMask = id in [added_vocab_start, added_vocab_end)
        AscendC::LocalTensor<int8_t> addedVocabMask = result_add_mask_que.AllocTensor<int8_t>();
        ComputeRangeMask(addedVocabMask,
                        inputFloat,
                        static_cast<float>(added_vocab_start_index_),
                        static_cast<float>(added_vocab_end_index_));

        // validOffset = org_start * orgMask + added_offset * addedMask:
        // per-element amount to subtract so each id becomes a local index.
        AscendC::LocalTensor<float> validOffset = validOffset_buf.Get<float>();
        AscendC::LocalTensor<float> constOrgStartIndex = start_buf.Get<float>();

        AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_);

        // NOTE(review): orgVocabMask_fp16/orgVocabMask_fp32 are default-constructed
        // LocalTensors used as Cast destinations without a backing TBuf/TQue
        // allocation — looks like a bug (or relies on undocumented LocalTensor
        // behavior); confirm against the AscendC LocalTensor contract.
        AscendC::LocalTensor<half> orgVocabMask_fp16;
        AscendC::LocalTensor<float> orgVocabMask_fp32;
        AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
        AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);

        AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_);

        // NOTE(review): addedOffset is likewise default-constructed with no backing
        // buffer before being written by Mul below — confirm.
        AscendC::LocalTensor<float> addedOffset;
        AscendC::LocalTensor<float> addedOffsetTensor = end_buf.Get<float>();
        AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_);

        AscendC::LocalTensor<half> addedVocabMask_fp16;
        AscendC::LocalTensor<float> addedVocabMask_fp32;
        AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
        AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);

        AscendC::Mul(addedOffset, addedOffsetTensor, addedVocabMask_fp32, size_);
        AscendC::Add(validOffset, validOffset, addedOffset, size_);

        // vocabMask = orgVocabMask | addedVocabMask ("this rank owns the id").
        AscendC::LocalTensor<int8_t> vocabMask = vocabMask_buf_.Get<int8_t>();

#if (__CCE_AICORE__ >= 220)
        AscendC::Or(vocabMask,
                    orgVocabMask,
                    addedVocabMask,
                    size_);
#else
        {
            // WORKAROUND for older arch
            // No direct int8->int16 cast. Use half as intermediate.
            // No direct int8 Or. Use int16 Or.
            AscendC::LocalTensor<int16_t> orgVocabMask_i16 = calc_buf_1.Get<int16_t>();
            AscendC::LocalTensor<int16_t> addedVocabMask_i16 = calc_buf_2.Get<int16_t>();
            AscendC::LocalTensor<int16_t> vocabMask_i16 = orgVocabMask_i16;

            // Use a temporary buffer for half type. inputFloat_buf is free now.
            AscendC::LocalTensor<half> tmp_half = inputFloat_buf.Get<half>();

            // 1. Cast inputs: int8_t -> half -> int16_t
            AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);

            AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);

            // 2. Perform Or on int16_t tensors
            AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_);

            // 3. Cast result back: int16_t -> half -> int8_t
            AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_);
            AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
        }
#endif

        // masked = (input - offset) * vocabMask: local index for owned ids, 0 otherwise.
        AscendC::Sub(inputFloat, inputFloat, validOffset, size_);

        // NOTE(review): vocabMask_fp16/vocabMask_fp32 also default-constructed
        // without backing buffers — same concern as above.
        AscendC::LocalTensor<half> vocabMask_fp16;
        AscendC::LocalTensor<float> vocabMask_fp32;
        AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_);
        AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);

        AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_);

        AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_);
        outQueue.EnQue(maskedLocal);

        // mask_out = 1 - vocabMask: 1 where the id is NOT owned by this rank.
        AscendC::LocalTensor<float> ones_tensor = ones_buf_.Get<float>();
        AscendC::Duplicate(ones_tensor, (float)1, size_);
        // NOTE(review): maskLocal_fp32 / maskLocal_fp16 default-constructed
        // without backing buffers — same concern as above.
        AscendC::LocalTensor<float> maskLocal_fp32;

        AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_);

        AscendC::LocalTensor<half> maskLocal_fp16;
        AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_);
        AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_);
        maskQueue.EnQue(maskLocal);
        inQueue.FreeTensor(inputLocal);
    }

    // Writes both outputs (masked ids and the inverse ownership mask) back to
    // global memory and releases the local tensors.
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.DeQue<scalar_t>();
        AscendC::LocalTensor<bool> maskLocal = maskQueue.DeQue<bool>();

        AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_);
        AscendC::DataCopy(maskOutGlobal, maskLocal, size_);

        outQueue.FreeTensor(maskedLocal);
        maskQueue.FreeTensor(maskLocal);
    }

private:
    static constexpr int32_t BUFFER_NUM = 2;
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueue;
    AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueue, maskQueue;
    AscendC::GlobalTensor<scalar_t> inputGlobal, maskedOutputGlobal;
    AscendC::GlobalTensor<bool> maskOutGlobal;
    // General-purpose scratch; doubles as int16 scratch in the old-arch paths.
    AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_1;
    AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_2;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_ge_que;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_le_que;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_org_mask_que;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_add_mask_que;

    // Temporary buffers
    AscendC::TBuf<AscendC::TPosition::VECCALC> start_buf;
    AscendC::TBuf<AscendC::TPosition::VECCALC> end_buf;
    AscendC::TBuf<AscendC::TPosition::VECCALC> inputFloat_buf;
    AscendC::TBuf<AscendC::TPosition::VECCALC> validOffset_buf;
    AscendC::TBuf<AscendC::TPosition::VECCALC> vocabMask_buf_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> ones_buf_;

    __gm__ scalar_t *input_, *masked_input_;
    __gm__ bool *mask_out_;
    int64_t size_;  // element count, rounded up to a multiple of 32
    int64_t org_vocab_start_index_, org_vocab_end_index_;
    int64_t added_vocab_start_index_, added_vocab_end_index_;
    int64_t added_offset_;

    // Constants for the float-arithmetic comparison trick in CompareWithValue:
    // MIN_ACCURACY_FP32 is the smallest normal fp32; the two MAX_MUL factors
    // (2^50 and 2^26) together scale it up to exactly 1.0.
    static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38;
    static constexpr float MAX_MUL_1_FP32 = 1125899906842624;
    static constexpr float MAX_MUL_2_FP32 = 67108864;
    static constexpr float NEGATIVE_ONE_FP32 = -1.0f;
};
|
||||||
|
|
||||||
|
// Device entry point: splits the flat input of `size` elements into `loop_cnt`
// equal chunks of size/loop_cnt and assigns chunks to AI cores round-robin
// (core b handles chunks b, b+aiv_num, b+2*aiv_num, ...).
// NOTE(review): assumes loop_cnt evenly divides size — the tail `size % loop_cnt`
// elements are never processed otherwise; confirm against the host caller.
extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel(
    __gm__ int32_t* input,
    __gm__ int32_t* masked_input,
    __gm__ bool* mask_out,
    const int64_t org_vocab_start_index,
    const int64_t org_vocab_end_index,
    const int64_t num_org_vocab_padding,
    const int64_t added_vocab_start_index,
    const int64_t added_vocab_end_index,
    const int64_t size,
    const uint32_t loop_cnt,
    const uint32_t aiv_num)
{
    {
        GetMaskedInputAndMask<int32_t> op{};

        for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) {
            // Re-Init the same op for each chunk, offsetting all three global
            // pointers by i * (size/loop_cnt) elements.
            // NOTE(review): Init calls pipe.InitBuffer on every iteration of the
            // loop on the same TPipe instance — verify TPipe tolerates repeated
            // InitBuffer without a Reset between iterations.
            op.Init(input + i * size/loop_cnt,
                    masked_input + i * size/loop_cnt,
                    mask_out + i * size/loop_cnt,
                    org_vocab_start_index, org_vocab_end_index,
                    num_org_vocab_padding, added_vocab_start_index,
                    added_vocab_end_index, size/loop_cnt);

            op.Process();
        }
    } // op destructor called here
}
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
|
||||||
|
void get_masked_input_and_mask_impl(
|
||||||
|
void* stream,
|
||||||
|
void* input,
|
||||||
|
void* masked_input,
|
||||||
|
void* mask_out,
|
||||||
|
const int64_t org_vocab_start_index,
|
||||||
|
const int64_t org_vocab_end_index,
|
||||||
|
const int64_t num_org_vocab_padding,
|
||||||
|
const int64_t added_vocab_start_index,
|
||||||
|
const int64_t added_vocab_end_index,
|
||||||
|
const int64_t size,
|
||||||
|
const uint32_t loop_cnt,
|
||||||
|
const uint32_t aiv_num)
|
||||||
|
{
|
||||||
|
get_masked_input_and_mask_kernel<<<aiv_num, nullptr, stream>>>(
|
||||||
|
static_cast<int32_t*>(input),
|
||||||
|
static_cast<int32_t*>(masked_input),
|
||||||
|
static_cast<bool*>(mask_out),
|
||||||
|
org_vocab_start_index,
|
||||||
|
org_vocab_end_index,
|
||||||
|
num_org_vocab_padding,
|
||||||
|
added_vocab_start_index,
|
||||||
|
added_vocab_end_index,
|
||||||
|
size,
|
||||||
|
loop_cnt,
|
||||||
|
aiv_num);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm_ascend
|
||||||
372
csrc/kernels/pos_encoding_kernels.cpp
Normal file
372
csrc/kernels/pos_encoding_kernels.cpp
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "types.h"
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
|
||||||
|
using vllm_ascend::AccType;
|
||||||
|
using vllm_ascend::local_mem_copy;
|
||||||
|
// Applies rotary position embedding (RoPE) to the query/key tensors of one
// token on a single Ascend vector core.
//   scalar_t : element type of the input tensors (half, or bfloat16_t on >=220).
//   isNeox   : selects the GPT-NeoX rotation layout. Note that only the NeoX
//              split-half path (neox_compute) is implemented in this class;
//              the template parameter itself is only used to name the kernel.
template <typename scalar_t, bool isNeox> class RotaryEmbedding {
    // NOTE(ganyi): we use 512B as load stride for pipe, need to find another way to
    // retrieve this size from runtime for more Soc support
#if (__CCE_AICORE__ >= 220)
    static int constexpr loadSize = 512;
#else
    static int constexpr loadSize = 1024 * 4;
#endif
    // Destination element type matches the source type.
    using dst_t = scalar_t;
    // Accumulation type: AccType maps bf16 -> fp32 so the rotation math is done
    // in higher precision; for half it stays half (see the is_same_v branches).
    using acc_t = typename AccType<scalar_t>::type;
    // only half tensor have cast instruct to int8, hardcode acc_dst_t as half
    using local_scalar_t = AscendC::LocalTensor<scalar_t>;
    using local_acc_t = AscendC::LocalTensor<acc_t>;
    using local_dst_t = AscendC::LocalTensor<dst_t>;

public:
    __aicore__ inline RotaryEmbedding()
    {
    }

    // Allocate buffers for input and output queue and the temp buffer used during kernel compute process,
    // this init process happens only in the kernel compute on a single vector core.
    //
    // positions     : per-token position ids (indexes into cosSinCache rows).
    // queryDst/keyDst : output base pointers (may alias the inputs for in-place RoPE —
    //                   TODO confirm; not visible from this file).
    // cosSinCache   : [maxPos, rotDim] table; each row is [cos(embedDim) | sin(embedDim)].
    // *Stride       : element strides between consecutive tokens (handles
    //                 non-contiguous views on the num_tokens dimension).
    // pipe          : caller-owned TPipe used for all local-buffer allocation.
    __aicore__ inline void init(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
                                __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
                                const int rotDim, const int64_t dstQueryStride,
                                const int64_t dstKeyStride, const int64_t queryStride, const int64_t keyStride,
                                const int numHeads, const int numKvHeads, const int headSize, AscendC::TPipe *pipe)
    {
        pipe_ = pipe;
        rotDim_ = rotDim;
        // query stride and key stride is used to handle the strided tensor which is not contiguous on num_tokens dim
        queryStride_ = queryStride;
        keyStride_ = keyStride;
        dstQueryStride_ = dstQueryStride;
        dstKeyStride_ = dstKeyStride;
        numHeads_ = numHeads;
        numKvHeads_ = numKvHeads;
        headSize_ = headSize;
        // Half of the rotary dimension: x/y halves of the NeoX split.
        embedDim_ = rotDim / 2;

        pipe_->InitBuffer(inQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
        pipe_->InitBuffer(inQueSinCos_, 1 /* buffer_num */, rotDim_ * sizeof(scalar_t) /* buffer_size */);
        pipe_->InitBuffer(outQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
        // calcBuf_ is carved up by byte offsets computed below:
        // 2 temporary calculation buffer
        calcTmpBufferOffset_ = 0;
        // 1 upcast buffer for bf16 (headSize)
        upcastInputBufferOffset_ = calcTmpBufferOffset_ + sizeof(acc_t) * embedDim_ * 2;
        // 1 upcast temp buffer for bf16 (2 * embed_dim)
        upcastTempBufferOffset_ = upcastInputBufferOffset_ + sizeof(acc_t) * headSize_;
        // 2 sin cos upcast buffer for bf16
        cosSinUpcastBufferOffset_ = upcastTempBufferOffset_ + sizeof(acc_t) * 2 * embedDim_;
        // 2. bf16 path: needs 2 cos sin upcast buffer size
        // 3. fp16 path: needs 2 temporary calculation buffer size
        tempBufferSize_ = cosSinUpcastBufferOffset_ + 2 * embedDim_ * sizeof(acc_t);
        // need to consider upcast the bf16 to fp32, so we might need 4 buffer just in case
        // 2 temporary buffer, 2 input buffer, 1 cos buffer, 1 sin buffer, 2 scale buffer (headSize), 2 zp
        // buffer(headSize int8), 1 dst_temp buffer(headSize, int32)
        pipe_->InitBuffer(calcBuf_, tempBufferSize_ /* buffer_size */);
        // Extra staging buffer only needed when up-casting (bf16 path).
        if constexpr (!std::is_same_v<scalar_t, acc_t>) {
            pipe_->InitBuffer(copyBuf_, loadSize);
        }
    }

    // Rebind all global-memory tensor views to token `idx`: read its position id
    // and point cosSin_/query_/key_ and both destinations at that token's data.
    // Called once per token inside the kernel's grid-stride style token loop.
    __aicore__ inline void update_mem_offset(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
                                             __gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
                                             const int rotDim, const int64_t dstQueryStride, const int64_t dstKeyStride,
                                             const int64_t queryStride, const int64_t keyStride, const int numHeads,
                                             const int numKvHeads, const int headSize, const int64_t idx)
    {
        int64_t pos = positions[idx];
        // One cached row of [cos | sin] for this position.
        cosSin_.SetGlobalBuffer(cosSinCache + pos * rotDim_, rotDim_);
        query_.SetGlobalBuffer(query + queryStride * idx, headSize * numHeads_);
        key_.SetGlobalBuffer(key + keyStride * idx, headSize * numKvHeads_);
        queryDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(queryDst) + dstQueryStride * idx,
                                  headSize * numHeads_);
        keyDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(keyDst) + dstKeyStride * idx, headSize * numKvHeads_);
    }

    // compute per head for neox on bf16
    // (acc_t_ != scalar_t overload: up-casts the head to acc_t before rotating,
    // then truncating-casts the two result halves back to dst_t.)
    // NeoX rotation per half-pair:
    //   dstX = srcX*cos - srcY*sin
    //   dstY = srcX*sin + srcY*cos
    template <typename acc_t_, typename std::enable_if<!std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
    __aicore__ inline void
    neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
                 AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
    {
        // slice dst
        local_dst_t dstX = dst;
        local_dst_t dstY = dst[embedDim_];

        // slice src
        local_scalar_t srcX = src;
        local_scalar_t srcY = src[embedDim_];

        // slice temp buffer
        local_acc_t calcTmpBufferX = calcTmpBuffer;
        local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];

        // slice upcast input buffer
        local_acc_t upcastBufferX = upcastInputBuffer;
        local_acc_t upcastBufferY = upcastBufferX[embedDim_];

        // dst x calc
        Cast(upcastInputBuffer, src, AscendC::RoundMode::CAST_NONE, headSize_);
        Mul(calcTmpBufferX, upcastBufferX, cos, embedDim_);
        Mul(calcTmpBufferY, upcastBufferY, sin, embedDim_);
        Sub(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
        Cast(dstX, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);

        // dst y calc
        Mul(calcTmpBufferX, upcastBufferX, sin, embedDim_);
        Mul(calcTmpBufferY, upcastBufferY, cos, embedDim_);
        Add(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
        Cast(dstY, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);
    }

    // compute per head output for neox
    // (acc_t_ == scalar_t overload, i.e. the fp16 path: no up-cast needed, the
    // rotation is computed directly in the source precision.)
    template <typename acc_t_, typename std::enable_if<std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
    __aicore__ inline void
    neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
                 AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
    {
        // slice dst buffer
        local_dst_t dstX = dst;
        local_dst_t dstY = dst[embedDim_];
        // slice src buffer
        local_scalar_t srcX = src;
        local_scalar_t srcY = src[embedDim_];
        // slice temp buffer
        local_acc_t calcTmpBufferX = calcTmpBuffer;
        local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];

        // dst x calc: dstX = srcX*cos - srcY*sin
        Mul(calcTmpBufferX, srcX, cos, embedDim_);
        Mul(calcTmpBufferY, srcY, sin, embedDim_);
        Sub(dstX, calcTmpBufferX, calcTmpBufferY, embedDim_);

        // dst y calc: dstY = srcX*sin + srcY*cos
        Mul(calcTmpBufferX, srcX, sin, embedDim_);
        Mul(calcTmpBufferY, srcY, cos, embedDim_);
        Add(dstY, calcTmpBufferX, calcTmpBufferY, embedDim_);
    }

    // Streams one tensor (query or key) through the pipe in tiles of
    // `headNumPerLoad` heads, rotating the first rotDim elements of each head,
    // then handles the tail tile of `tailHeads` heads.
    // The copy into `dst` before neox_compute preserves the non-rotated part of
    // each head (when headSize > rotDim).
    __aicore__ inline void compute_qk(AscendC::GlobalTensor<scalar_t> srcG, AscendC::GlobalTensor<dst_t> dstG,
                                      local_acc_t localCos, local_acc_t localSin, local_acc_t upcastInputBuffer,
                                      local_acc_t calcTmpBuffer, int loopCnt, int tailHeads, int loadStride,
                                      int headNumPerLoad)
    {
        for (int loopNum = 0; loopNum < loopCnt; ++loopNum) {
            local_scalar_t src = inQue_.AllocTensor<scalar_t>();
            local_dst_t dst = outQue_.AllocTensor<dst_t>();
            AscendC::DataCopy(src, srcG[loopNum * loadStride], loadStride);
            inQue_.EnQue(src);

            local_scalar_t srcDeque = inQue_.DeQue<scalar_t>();
            if constexpr (!std::is_same_v<scalar_t, acc_t>) {
                // bf16 path: pass-through copy src -> dst by round-tripping
                // through fp32 (bf16 lacks a direct local copy path here).
                // NOTE(review): loadStride is an element count
                // (headNumPerLoad * headSize), yet it is divided by
                // sizeof(scalar_t) here — looks like a byte/element mix-up that
                // would copy only a fraction of the tile; confirm intended size.
                int elem_num = loadStride / sizeof(scalar_t);
                AscendC::LocalTensor<acc_t> upBuffer = copyBuf_.GetWithOffset<acc_t>(elem_num, 0);
                Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num);
                Cast(dst, upBuffer, AscendC::RoundMode::CAST_TRUNC, elem_num);
            } else {
                local_mem_copy(dst, srcDeque, loadStride);
            }
            // Rotate each head in this tile in place over the copied data.
            for (int i = 0; i < headNumPerLoad; ++i) {
                neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer,
                             calcTmpBuffer);
            }
            outQue_.EnQue(dst);
            local_dst_t dstDeque = outQue_.DeQue<dst_t>();
            AscendC::DataCopy(dstG[loopNum * loadStride], dstDeque, loadStride);
            outQue_.FreeTensor(dstDeque);
            inQue_.FreeTensor(srcDeque);
        }
        // process tail
        {
            local_scalar_t src = inQue_.AllocTensor<scalar_t>();
            local_dst_t dst = outQue_.AllocTensor<dst_t>();

            AscendC::DataCopy(src, srcG[loopCnt * loadStride], tailHeads * headSize_);
            inQue_.EnQue(src);
            local_scalar_t srcDeque = inQue_.DeQue<scalar_t>();

            if constexpr (!std::is_same_v<scalar_t, acc_t>) {
                // NOTE(review): same element-count / sizeof division as above —
                // verify before relying on the bf16 tail copy.
                int elem_num = tailHeads * headSize_ / sizeof(scalar_t);
                AscendC::LocalTensor<acc_t> upBuffer = copyBuf_.GetWithOffset<acc_t>(elem_num, 0);
                Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num);
                Cast(dst, upBuffer, AscendC::RoundMode::CAST_TRUNC, elem_num);
            } else {
                local_mem_copy(dst, srcDeque, tailHeads * headSize_);
            }

            for (int i = 0; i < tailHeads; ++i) {
                neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer,
                             calcTmpBuffer);
            }
            outQue_.EnQue(dst);
            local_dst_t dstDeque = outQue_.DeQue<dst_t>();
            AscendC::DataCopy(dstG[loopCnt * loadStride], dstDeque, tailHeads * headSize_);
            outQue_.FreeTensor(dstDeque);
            inQue_.FreeTensor(srcDeque);
        }
    }

    // Per-token driver: loads the cos/sin row for this token, slices the scratch
    // buffers out of calcBuf_, up-casts cos/sin on the bf16 path, then runs
    // compute_qk over the query heads and the (possibly fewer) key heads.
    __aicore__ inline void compute_function()
    {
        local_scalar_t cosSinLocal = inQueSinCos_.AllocTensor<scalar_t>();

        // One row of the cache: first embedDim_ cos values, then embedDim_ sin.
        AscendC::DataCopy(cosSinLocal, cosSin_, embedDim_ * 2);

        inQueSinCos_.EnQue(cosSinLocal);
        local_scalar_t localSinCosDeque = inQueSinCos_.DeQue<scalar_t>();
        local_scalar_t localCos = localSinCosDeque;
        local_scalar_t localSin = localSinCosDeque[embedDim_];

        local_acc_t calcTmpBuffer;
        local_acc_t upcastInputBuffer;
        local_acc_t upcastTempBuffer;
        local_acc_t cosSinUpcastBuffer;
        // NOTE(review): scaleBuffer/offsetBuffer are declared but never used in
        // this function — likely leftovers from a quantization path.
        local_acc_t scaleBuffer;
        local_acc_t offsetBuffer;
        calcTmpBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, calcTmpBufferOffset_);
        upcastInputBuffer = calcBuf_.GetWithOffset<acc_t>(headSize_, upcastInputBufferOffset_);
        upcastTempBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, upcastTempBufferOffset_);
        cosSinUpcastBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, cosSinUpcastBufferOffset_);

        local_acc_t cosAccBuffer;
        local_acc_t sinAccBuffer;

        if constexpr (!std::is_same_v<scalar_t, acc_t>) {
            // bf16 path: up-cast the whole cos|sin row once per token.
            Cast(cosSinUpcastBuffer, localSinCosDeque, AscendC::RoundMode::CAST_NONE, 2 * embedDim_);
            cosAccBuffer = cosSinUpcastBuffer;
            sinAccBuffer = cosSinUpcastBuffer[embedDim_];
        } else {
            cosAccBuffer = localCos;
            sinAccBuffer = localSin;
        }

        // Tile geometry: how many whole heads fit into one loadSize tile, and
        // how many full tiles / leftover heads for query vs. key.
        constexpr const int loadSizeByElem = loadSize / sizeof(scalar_t);
        int64_t headNumPerLoad = loadSizeByElem / headSize_;
        int64_t loopCnt = numHeads_ / headNumPerLoad;
        int64_t tailHeads = numHeads_ - loopCnt * headNumPerLoad;
        int64_t loadStride = headNumPerLoad * headSize_;
        int64_t loopCntKv = numKvHeads_ / headNumPerLoad;
        int64_t tailHeadsKv = numKvHeads_ - loopCntKv * headNumPerLoad;
        compute_qk(query_, queryDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer,
                   calcTmpBuffer, loopCnt, tailHeads, loadStride, headNumPerLoad);

        compute_qk(key_, keyDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer, calcTmpBuffer,
                   loopCntKv, tailHeadsKv, loadStride, headNumPerLoad);

        inQueSinCos_.FreeTensor(localSinCosDeque);
    }

private:
    AscendC::TPipe *pipe_;            // caller-owned pipe; all buffers live on it
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQue_, inQueSinCos_;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQue_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf_;   // scratch, sliced by *Offset_ members
    AscendC::TBuf<AscendC::TPosition::VECCALC> copyBuf_;   // bf16-only staging for pass-through copy
    AscendC::GlobalTensor<dst_t> queryDst_;
    AscendC::GlobalTensor<dst_t> keyDst_;
    AscendC::GlobalTensor<scalar_t> query_;
    AscendC::GlobalTensor<scalar_t> key_;
    AscendC::GlobalTensor<scalar_t> cosSin_;   // current token's [cos | sin] row
    int rotDim_;
    int embedDim_;       // rotDim_ / 2
    int64_t queryStride_;
    int64_t keyStride_;
    int64_t dstQueryStride_;
    int64_t dstKeyStride_;
    int numHeads_;
    int numKvHeads_;
    int headSize_;
    // Byte offsets into calcBuf_ (see init()).
    int calcTmpBufferOffset_;
    int upcastInputBufferOffset_;
    int upcastTempBufferOffset_;
    int cosSinUpcastBufferOffset_;
    int tempBufferSize_;
};
|
||||||
|
|
||||||
|
// Note: Need to use macro to instaniate all the target functions here, for the current build system dose not support template call in cpp
|
||||||
|
// We use C style symbol here for kernel compilation, cpp style kernel entry may lead to compilation failure
|
||||||
|
// Note: Need to use macro to instaniate all the target functions here, for the current build system dose not support template call in cpp
// We use C style symbol here for kernel compilation, cpp style kernel entry may lead to compilation failure
//
// Expands to one extern "C" kernel entry named rope_custom_<NEOX>_<TYPE>.
// Each core processes tokens i = blockIdx, blockIdx + coreNum, ... (strided
// token loop), rebinding the per-token views before each compute_function().
// Comments must stay outside the macro body: a // comment before a trailing
// backslash would swallow the line continuation.
#define ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, NEOX)                                                                    \
    extern "C" __global__ __aicore__ void rope_custom_##NEOX##_##TYPE(                                                 \
        __gm__ int64_t* positions, __gm__ void* queryDst, __gm__ void* keyDst, __gm__ TYPE* query, __gm__ TYPE* key,   \
        __gm__ TYPE* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride,                \
        const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads,            \
        const int headSize, const int64_t numTokens, const int loopNum, const int coreNum)                             \
    {                                                                                                                  \
        AscendC::TPipe pipe;                                                                                           \
        RotaryEmbedding<TYPE, NEOX> op{};                                                                              \
        op.init(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride,            \
                queryStride, keyStride, numHeads, numKvHeads, headSize, &pipe);                                        \
        for (int64_t i = AscendC::GetBlockIdx(); i < numTokens; i += coreNum) {                                        \
            op.update_mem_offset(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride, \
                                 queryStride, keyStride, numHeads, numKvHeads, headSize, i);                           \
            op.compute_function();                                                                                     \
        }                                                                                                              \
    }
|
||||||
|
|
||||||
|
// Instantiates both layout variants (neox=true/false) for one element type.
#define ROPE_CUSTOM_KERNEL_DECLARE(TYPE)      \
    ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, true); \
    ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, false);
|
||||||
|
|
||||||
|
// Declare all the kernel entry here
ROPE_CUSTOM_KERNEL_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
// bfloat16 is only available on AICore architectures >= 220.
ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t)
#endif
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
|
||||||
|
// Launches the matching rope_custom_{true|false}_<TYPE> kernel.
// This macro deliberately captures the caller's locals by name: isNeox,
// blockDim, stream, positions, queryDst, keyDst, query, key, cosSinCache,
// rotDim, the four strides, numHeads, numKvHeads, headSize, numTokens and
// loopCnt must all be in scope at the expansion site (see
// rotary_embedding_impl below). blockDim is also passed as the kernel's
// coreNum argument so the in-kernel token loop strides by the launch size.
#define ROTARY_EMBEDDING_KERNEL_CALL(TYPE)                                                                         \
    if (isNeox)                                                                                                    \
        rope_custom_true_##TYPE<<<blockDim, nullptr, stream>>>(                                                    \
            positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key),           \
            reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride,   \
            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);                                         \
    else                                                                                                           \
        rope_custom_false_##TYPE<<<blockDim, nullptr, stream>>>(                                                   \
            positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key),           \
            reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride,   \
            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);
|
||||||
|
|
||||||
|
// maximum number for runtime to launch a ascendc kernel.
// we use this to constrain the maximum number of block size
static const int64_t maxParallelSize = 65535;

// Host-side entry: dispatches the RoPE kernel on `stream` for `numTokens`
// tokens of dtype `type` (FP16 always; BF16 only on AICore >= 220).
// Pointer/stride parameters are forwarded verbatim to the kernel — see
// RotaryEmbedding::init for their meaning. aivNum is currently unused here;
// parallelism is min(numTokens, maxParallelSize) blocks, one token-stride
// loop per block.
extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
                                  void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
                                  const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
                                  const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
                                  const int headSize, const int64_t numTokens, const uint32_t loopCnt,
                                  uint32_t aivNum)
{
    // Cap the launch size at the runtime's limit; the kernel's strided token
    // loop covers the remainder when numTokens > maxParallelSize.
    // (Local names here are part of the ROTARY_EMBEDDING_KERNEL_CALL contract.)
    int blockDim = maxParallelSize > numTokens ? numTokens : maxParallelSize;
    if (type == AscendType::FP16) {
        ROTARY_EMBEDDING_KERNEL_CALL(half);
    }
#if (__CCE_AICORE__ >= 220)
    else if (type == AscendType::BF16) {
        ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t);
    }
#endif
    else {
        // Unsupported dtype: silently no-op (no error reported to the caller).
        return;
    }
}
|
||||||
|
|
||||||
|
} // namespace vllm_ascend
|
||||||
389
csrc/kernels/sgmv_expand.cpp
Normal file
389
csrc/kernels/sgmv_expand.cpp
Normal file
@@ -0,0 +1,389 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include "types.h"
|
||||||
|
|
||||||
|
// SGMV "expand" kernel for multi-LoRA serving: computes, per token,
//   yOut[slice] = yIn[slice] + x @ B
// where x is the token's fp32 LoRA hidden state (length maxLoRARank) and B is
// the token's LoRA-B weight slice selected via seqLen/loraIndices. scalar_t is
// the weight/output dtype (half or bfloat16_t); accumulation is in fp32.
template <typename scalar_t>
class SGMVExpand {
public:
    using X_T = float;     // LoRA intermediate is always fp32
    using W_T = scalar_t;  // weight dtype
    using Y_T = scalar_t;  // output dtype

    static constexpr uint64_t LORA_RANK_8 = 8;
    static constexpr uint64_t LORA_RANK_16 = 16;
    static constexpr uint64_t LORA_RANK_32 = 32;
    static constexpr uint64_t LORA_RANK_64 = 64;
    static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
    // Double buffering for the streaming queues.
    static constexpr int32_t BUFFER_NUM = 2;

    // The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
    static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
    static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
    // The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
    static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
    // Mask is used to control the elements that participate in computation in each iteration.
    static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
    // Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
    static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
    static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
    static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
    // BlockReduceSum would generate(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT)floats.
    // So need to read them all and apply PairReduceSum
    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 =
        (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
    // The second PairReduceSum for rank=32, needs half of the repetition that happened for rank=16.
    // Same for rank=64, we do not support ranks greater than 64.
    static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;

public:
    __aicore__ inline SGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}

    // Binds global-memory views and allocates all local queues/buffers.
    //   loraIndices/seqLen : parallel arrays; seqLen[i] tokens use LoRA
    //                        loraIndices[i] (an index of -1 means "no LoRA").
    //   sliceOffset/outputFullDim : the output written here is a slice
    //                        [sliceOffset, sliceOffset + outputHiddenDim) of a
    //                        row of width outputFullDim (merged-QKV style).
    __aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* loraIndices, uint32_t loraIndicesSize,
                                __gm__ void* seqLen, uint32_t seqLenSize, __gm__ void* yIn, __gm__ void* yOut,
                                uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
                                uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
    {
        batchSize_ = batchSize;
        numTokensPerCore_ = numTokensPerCore;
        maxLoRARank_ = maxLoRARank;
        outputHiddenDim_ = outputHiddenDim;
        sliceOffset_ = sliceOffset;
        outputFullDim_ = outputFullDim;
        // Elements per single LoRA adapter's B matrix (rank x hidden).
        singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;

        xGm_.SetGlobalBuffer((__gm__ X_T *)x);
        wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
        yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
        yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
        loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
        seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);

        pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
        pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
        pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
        pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));

        pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
        pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
        pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
        pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));

        // Each compute iteration would generate not one, but several output elements.
        // Therefore, the following variable would determine how many output elements are calculated in each iteration.
        numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
        numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;

    }

    // Token loop for this core: each core handles a contiguous range of
    // numTokensPerCore_ tokens (clamped to batchSize_), skipping tokens whose
    // LoRA index is negative.
    __aicore__ inline void Process()
    {
        int64_t blockIdx = AscendC::GetBlockIdx();
        int64_t startIdx = blockIdx * numTokensPerCore_;
        int64_t endIdx = startIdx + numTokensPerCore_;
        if (endIdx > batchSize_) {
            endIdx = batchSize_;
        }
        for (int64_t idx = startIdx; idx < endIdx; idx++) {
            // Output slice start for this token's row.
            yOffset_ = outputFullDim_ * idx + sliceOffset_;

            // Set up LoRA index
            CopyInIndex(idx);
            if (reqLoRAIndex_ < 0) {
                // Negative index == no LoRA applied to this token.
                continue;
            }
            reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;

            CopyInX(idx);
            // Full output tiles, then the (smaller) remainder tile.
            int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
            for (int32_t i = 0; i < numStreamOut; i++) {
                CopyInY(i);
                for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
                    CopyInW(i * numStreamInPerOutputTile_ + j);
                    Compute(j * numOutputElementsPerInputTile_);
                }
                ScaleOutput();
                CopyOut(i);
            }
            ComputeLastIteration();
        }
    }

private:
    // Resolves token idx -> LoRA index by walking the seqLen run-length table;
    // sets reqLoRAIndex_ to -1 when idx falls past the table (no LoRA).
    __aicore__ inline void CopyInIndex(const int64_t idx)
    {
        // Look up the LoRA index
        int64_t weightIdx = idx;
        uint64_t i = 0;
        for (; i < seqLenGm_.GetSize(); i++) {
            int64_t repeatValue = seqLenGm_.GetValue(i);
            if (weightIdx >= repeatValue) {
                weightIdx -= repeatValue;
                continue;
            }
            break;
        }
        reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
    }

    // Handles the final partial output tile (outputHiddenDim_ %
    // Y_OUT_TILE_NUM_ELEMENTS elements), including a partial weight repeat.
    __aicore__ inline void ComputeLastIteration()
    {
        int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
        if (remainingY == 0) {
            return;
        }
        int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
        int32_t remainingW = remainingY * maxLoRARank_;
        int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
        int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;

        CopyInY(numStreamOut, remainingY);

        int32_t outputIdx = 0;
        for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
            CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
            Compute(outputIdx * numOutputElementsPerInputTile_);
        }

        if (remainingWForLastRepeat != 0) {
            // Partial weight tile: recompute the repeat counts for the smaller
            // reduction (mirrors the constexpr PAIR_REDUCE_* derivations).
            CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
                    remainingWForLastRepeat);
            int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
            int32_t pairReduceRepeat16 =
                (lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
            int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
            int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
            Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
        }

        ScaleOutput(remainingY);
        CopyOut(numStreamOut, remainingY);
    }

    // Loads the token's rank-length x vector and tiles it repeatedly into xDup
    // so one 256-byte repeat holds NUM_ELEMENTS_PER_REPEAT / maxLoRARank_
    // copies of x (enables multi-output dot products per vector repeat).
    __aicore__ inline void CopyInX(const int64_t idx)
    {
        AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
        if constexpr (std::is_same_v<X_T, float>) {
            DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
        } else {
            // Non-fp32 x: pad-copy by bytes (rank * sizeof may not fill a block).
            uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
            DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
        }
        inQueueX_.EnQue(xLocal);
        xLocal = inQueueX_.DeQue<X_T>();
        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();

        // As we are generating multiple output elements with one API invocation,
        // we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT
        if constexpr (std::is_same_v<X_T, float>) {
            for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
                for (int32_t j = 0; j < maxLoRARank_; j++) {
                    float entry = xLocal.GetValue(j);
                    xDup.SetValue(i + j, entry);
                }
            }
        } else {
            // Up-cast once, then replicate the first rank-length chunk.
            Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
            pipe_barrier(PIPE_V);

            for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
                for (int32_t j = 0; j < maxLoRARank_; j++) {
                    float entry = xDup.GetValue(j);
                    xDup.SetValue(i + j, entry);
                }
            }
        }
        inQueueX_.FreeTensor(xLocal);
    }

    // Streams in one output tile of the existing y values (for the += update).
    __aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
        DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
        inQueueY_.EnQue(yInLocal);
    }

    // Streams in one weight tile of the selected LoRA's B matrix.
    __aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
        DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
        inQueueW_.EnQue(wLocal);
    }

    // Adds the incoming y tile (up-cast to fp32) onto the accumulated partial
    // results in tmpBufferY_, then down-casts into the outgoing queue.
    // Name note: despite "Scale", this performs the residual add, not a scaling.
    __aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
        AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
        AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
        Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
        pipe_barrier(PIPE_V);
        inQueueY_.FreeTensor(yInLocal);

        // yLocal already holds the x@B partials written by Compute().
        Add(yLocal, yLocal, yInLocalFP32, numElements);
        pipe_barrier(PIPE_V);

        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
        Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
        pipe_barrier(PIPE_V);

        outQueueY_.EnQue<Y_T>(yOutLocal);
    }

    // Dot-products the duplicated x against one weight tile and reduces each
    // rank-length run to a single output element, writing the results into
    // tmpBufferY_ starting at `progress`. The reduction depth depends on rank:
    //   rank 8  -> one BlockReduceSum (8 lanes per block)
    //   rank 16 -> BlockReduceSum + PairReduceSum
    //   rank 32 -> BlockReduceSum + 2x PairReduceSum
    //   rank 64 -> 2x BlockReduceSum
    // The repeat-count parameters default to the full-tile constants and are
    // overridden by ComputeLastIteration for the partial tile.
    __aicore__ inline void Compute(int32_t progress,
                                   int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
                                   int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
                                   int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
    {
        AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
        AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
        AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
        AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();

        // Up-cast the weight tile to fp32 for accumulation.
        Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
        pipe_barrier(PIPE_V);
        inQueueW_.FreeTensor(wLocal);

        // Elementwise x*w; xDup's src0RepStride=0 reuses the same duplicated x
        // repeat against every weight repeat (see dotProductParams_).
        Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
        pipe_barrier(PIPE_V);

        if (maxLoRARank_ == LORA_RANK_8) {
            BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_16) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_32) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
                          reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        } else if (maxLoRARank_ == LORA_RANK_64) {
            BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
            BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
                           reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
            pipe_barrier(PIPE_V);
        }
    }

    // Writes one finished output tile back to global memory.
    __aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
    {
        AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
        DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
        outQueueY_.FreeTensor(yOutLocal);
    }

private:
    AscendC::TPipe* pipe_;   // caller-owned pipe; all local storage lives on it
    AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
    AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
    AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
    AscendC::GlobalTensor<X_T> xGm_;
    AscendC::GlobalTensor<W_T> wGm_;
    AscendC::GlobalTensor<Y_T> yInGm_;
    AscendC::GlobalTensor<Y_T> yOutGm_;
    AscendC::GlobalTensor<int64_t> loraIndicesGm_;
    AscendC::GlobalTensor<int64_t> seqLenGm_;
    uint32_t batchSize_;
    uint32_t numTokensPerCore_;
    uint32_t maxLoRARank_;
    uint32_t outputHiddenDim_;
    uint32_t sliceOffset_;
    uint32_t outputFullDim_;
    uint32_t singleLoRAWeightLen_;   // maxLoRARank_ * outputHiddenDim_
    int64_t reqLoRAIndex_;           // current token's LoRA index; -1 => skip
    uint64_t reqLoRAWeightOffset_;   // element offset of that LoRA's weights
    uint32_t numOutputElementsPerInputTile_;
    uint32_t numStreamInPerOutputTile_;
    uint64_t yOffset_;               // current token's output-slice start

    // The block stride is set to 1, and 8 blocks in the same repeat are processed continuously.
    // The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat,
    // reads next 8 consecutive blocks in the second repeat.
    AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};

    // For each repeat in BlockReduceSum and PairReduceSum we should move forward only one block,
    // so we set dstRepStride = 1
    AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};

    // When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks.
    // For xDup we repeatedly use it, so we set src0RepStride = 0
    AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};

};
|
||||||
|
|
||||||
|
#define SGMV_EXPAND_TYPE_DECLARE(TYPE) \
|
||||||
|
extern "C" __global__ __aicore__ void sgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, \
|
||||||
|
__gm__ void* loraIndices, uint32_t loraIndicesSize, \
|
||||||
|
__gm__ void* seqLen, uint32_t seqLenSize, \
|
||||||
|
__gm__ void* yIn, __gm__ void* yOut, \
|
||||||
|
uint32_t batchSize, uint32_t numTokensPerCore, \
|
||||||
|
uint32_t maxLoRARank, uint32_t outputHiddenDim, \
|
||||||
|
uint32_t sliceOffset, uint32_t outputFullDim) \
|
||||||
|
{ \
|
||||||
|
AscendC::TPipe pipe; \
|
||||||
|
SGMVExpand<TYPE> op(&pipe); \
|
||||||
|
op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \
|
||||||
|
yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \
|
||||||
|
outputHiddenDim, sliceOffset, outputFullDim); \
|
||||||
|
op.Process(); \
|
||||||
|
}
|
||||||
|
|
||||||
|
// declare all dtype kernel
|
||||||
|
SGMV_EXPAND_TYPE_DECLARE(half)
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
SGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weight,
|
||||||
|
void* loraIndices, uint32_t loraIndicesSize,
|
||||||
|
void* seqLen, uint32_t seqLenSize,
|
||||||
|
void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
|
||||||
|
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
|
||||||
|
{
|
||||||
|
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
|
||||||
|
if (type == AscendType::FP16) {
|
||||||
|
sgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
|
||||||
|
yIn, yOut, batchSize,
|
||||||
|
numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset,
|
||||||
|
outputFullDim);
|
||||||
|
} else if (type == AscendType::BF16) {
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
|
||||||
|
seqLen, seqLenSize, yIn, yOut, batchSize,
|
||||||
|
numTokensPerCore, maxLoRARank, outputHiddenDim,
|
||||||
|
sliceOffset, outputFullDim);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm_ascend
|
||||||
// ==== new file: csrc/kernels/sgmv_shrink.cpp (275 lines) ====
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel_operator.h"
|
||||||
|
#include "types.h"
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
class SGMVShrink {
|
||||||
|
public:
|
||||||
|
using X_T = scalar_t;
|
||||||
|
using W_T = scalar_t;
|
||||||
|
using Y_T = float;
|
||||||
|
|
||||||
|
static constexpr uint64_t BUFFER_NUM = 1;
|
||||||
|
static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length
|
||||||
|
|
||||||
|
public:
|
||||||
|
__aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
|
||||||
|
__aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, uint32_t loraIndicesSize,
|
||||||
|
__gm__ void *seqLen, uint32_t seqLenSize,
|
||||||
|
__gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
|
||||||
|
uint32_t maxLoRARank, float scale)
|
||||||
|
{
|
||||||
|
batchSize_ = batchSize;
|
||||||
|
numTokensPerCore_ = numTokensPerCore;
|
||||||
|
inputHiddenDim_ = inputHiddenDim;
|
||||||
|
maxLoRARank_ = maxLoRARank;
|
||||||
|
scale_ = scale;
|
||||||
|
singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
|
||||||
|
incremental_ = inputHiddenDim_ > TILE_LENGTH;
|
||||||
|
|
||||||
|
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
|
||||||
|
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
|
||||||
|
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
|
||||||
|
loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
|
||||||
|
seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
|
||||||
|
|
||||||
|
pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
|
||||||
|
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
|
||||||
|
pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
|
||||||
|
pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
|
||||||
|
|
||||||
|
pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
|
||||||
|
pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void Process()
|
||||||
|
{
|
||||||
|
int64_t blockIdx = AscendC::GetBlockIdx();
|
||||||
|
int64_t startIdx = blockIdx * numTokensPerCore_;
|
||||||
|
int64_t endIdx = startIdx + numTokensPerCore_;
|
||||||
|
if (endIdx > batchSize_) {
|
||||||
|
endIdx = batchSize_;
|
||||||
|
}
|
||||||
|
for (int64_t idx = startIdx; idx < endIdx; idx++) {
|
||||||
|
// set up LoRA index
|
||||||
|
CopyInIndex(idx);
|
||||||
|
if (reqLoRAIndex_ < 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
|
||||||
|
|
||||||
|
if (incremental_) {
|
||||||
|
ProcessImpl<true>(idx);
|
||||||
|
} else {
|
||||||
|
ProcessImpl<false>(idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
ScaleOutput();
|
||||||
|
CopyOut(idx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
template <bool INCREMENTAL_MODE>
|
||||||
|
__aicore__ inline void ProcessImpl(const int64_t idx)
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
|
||||||
|
if constexpr (!INCREMENTAL_MODE) {
|
||||||
|
CopyInX(idx, 0, inputHiddenDim_);
|
||||||
|
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
||||||
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
inQueueX_.FreeTensor(xLocal);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < maxLoRARank_; i++) {
|
||||||
|
float acc(0);
|
||||||
|
for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
|
||||||
|
if constexpr (INCREMENTAL_MODE) {
|
||||||
|
CopyInX(idx, j);
|
||||||
|
}
|
||||||
|
CopyInW(i, j);
|
||||||
|
Compute<INCREMENTAL_MODE>(acc);
|
||||||
|
}
|
||||||
|
CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
|
||||||
|
yOutLocal.SetValue(i, acc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void CopyInIndex(const int64_t idx)
|
||||||
|
{
|
||||||
|
// look up the LoRA index
|
||||||
|
int64_t weightIdx = idx;
|
||||||
|
uint64_t i = 0;
|
||||||
|
for (; i < seqLenGm_.GetSize(); i++) {
|
||||||
|
int64_t repeatValue = seqLenGm_.GetValue(i);
|
||||||
|
if (weightIdx >= repeatValue) {
|
||||||
|
weightIdx -= repeatValue;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
|
||||||
|
DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
|
||||||
|
inQueueX_.EnQue(xLocal);
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
|
||||||
|
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
|
||||||
|
inQueueW_.EnQue(wLocal);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool INCREMENTAL_MODE>
|
||||||
|
__aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
|
||||||
|
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
|
||||||
|
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
|
||||||
|
|
||||||
|
if constexpr (INCREMENTAL_MODE) {
|
||||||
|
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
|
||||||
|
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
inQueueX_.FreeTensor(xLocal);
|
||||||
|
inQueueW_.FreeTensor(wLocal);
|
||||||
|
} else {
|
||||||
|
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
inQueueW_.FreeTensor(wLocal);
|
||||||
|
}
|
||||||
|
// dot product of the one tile of X and W
|
||||||
|
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
// reduce sum generate one number, which is the summation of all the dot product
|
||||||
|
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
|
||||||
|
acc += wTmpTensor.GetValue(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <bool INCREMENTAL_MODE>
|
||||||
|
__aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
|
||||||
|
{
|
||||||
|
int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
|
||||||
|
int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
|
||||||
|
if (remaining == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if constexpr (INCREMENTAL_MODE) {
|
||||||
|
CopyInX(idx, colIdx, remaining);
|
||||||
|
}
|
||||||
|
CopyInW(rowIdx, colIdx, remaining);
|
||||||
|
Compute<INCREMENTAL_MODE>(acc, remaining);
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void ScaleOutput()
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<float> yLocal = outBufferY_.Get<float>();
|
||||||
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
|
||||||
|
|
||||||
|
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
|
||||||
|
pipe_barrier(PIPE_V);
|
||||||
|
|
||||||
|
outQueueY_.EnQue<Y_T>(yOutLocal);
|
||||||
|
}
|
||||||
|
|
||||||
|
__aicore__ inline void CopyOut(const int64_t idx)
|
||||||
|
{
|
||||||
|
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
|
||||||
|
DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
|
||||||
|
outQueueY_.FreeTensor(yOutLocal);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
AscendC::TPipe *pipe_;
|
||||||
|
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
|
||||||
|
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
|
||||||
|
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
|
||||||
|
AscendC::GlobalTensor<X_T> xGm_;
|
||||||
|
AscendC::GlobalTensor<W_T> wGm_;
|
||||||
|
AscendC::GlobalTensor<int64_t> loraIndicesGm_;
|
||||||
|
AscendC::GlobalTensor<int64_t> seqLenGm_;
|
||||||
|
AscendC::GlobalTensor<Y_T> yOutGm_;
|
||||||
|
uint32_t batchSize_;
|
||||||
|
uint32_t numTokensPerCore_;
|
||||||
|
uint32_t inputHiddenDim_;
|
||||||
|
uint32_t maxLoRARank_;
|
||||||
|
float scale_;
|
||||||
|
uint32_t singleLoRAWeightLen_;
|
||||||
|
int64_t reqLoRAIndex_;
|
||||||
|
uint64_t reqLoRAWeightOffset_;
|
||||||
|
bool incremental_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#define SGMV_SHRINK_TYPE_DECLARE(TYPE) \
|
||||||
|
extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \
|
||||||
|
__gm__ void* loraIndices, uint32_t loraIndicesSize, \
|
||||||
|
__gm__ void* seqLen, uint32_t seqLenSize, \
|
||||||
|
__gm__ void* y, uint32_t batchSize, \
|
||||||
|
uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
|
||||||
|
uint32_t maxLoRARank, float scale) \
|
||||||
|
{ \
|
||||||
|
AscendC::TPipe pipe; \
|
||||||
|
SGMVShrink<TYPE> op(&pipe); \
|
||||||
|
op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \
|
||||||
|
y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
|
||||||
|
op.Process(); \
|
||||||
|
}
|
||||||
|
|
||||||
|
// declare all dtype kernel
|
||||||
|
SGMV_SHRINK_TYPE_DECLARE(half)
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight,
|
||||||
|
void* loraIndices, uint32_t loraIndicesSize,
|
||||||
|
void* seqLen, uint32_t seqLenSize,
|
||||||
|
void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
|
||||||
|
uint32_t maxLoRARank, float scale)
|
||||||
|
{
|
||||||
|
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
|
||||||
|
if (type == AscendType::FP16) {
|
||||||
|
sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
|
||||||
|
y, batchSize,
|
||||||
|
numTokensPerCore, inputHiddenDim, maxLoRARank,
|
||||||
|
scale);
|
||||||
|
} else if (type == AscendType::BF16) {
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
|
||||||
|
seqLen, seqLenSize,
|
||||||
|
y, batchSize,
|
||||||
|
numTokensPerCore, inputHiddenDim, maxLoRARank,
|
||||||
|
scale);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm_ascend
|
||||||
// ==== new file: csrc/kernels/types.h (25 lines) ====
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once

namespace vllm_ascend {

// Scalar dtype tag passed from the torch bindings down to the kernel
// launchers. Values are explicit and stable: they select which extern "C"
// kernel symbol gets launched.
enum struct AscendType {
    FP16 = 0,
    BF16 = 1,
    FP32 = 2,
};

} // namespace vllm_ascend
|
||||||
// ==== new file: csrc/kernels/utils.h (51 lines) ====
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include "kernel_type.h"
|
||||||
|
namespace vllm_ascend {
|
||||||
|
|
||||||
|
template <typename scalar_t> struct AccType;
|
||||||
|
|
||||||
|
#if (__CCE_AICORE__ >= 220)
|
||||||
|
template <> struct AccType<bfloat16_t> {
|
||||||
|
using type = float;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <> struct AccType<half> {
|
||||||
|
using type = half;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct AccType<float> {
|
||||||
|
using type = float;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <> struct AccType<int8_t> {
|
||||||
|
using type = int;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
__aicore__ inline void local_mem_copy(AscendC::LocalTensor<scalar_t> dst, AscendC::LocalTensor<scalar_t> src, int size)
|
||||||
|
{
|
||||||
|
constexpr int loadSize = 256 / sizeof(scalar_t);
|
||||||
|
int loopCnt = size / loadSize;
|
||||||
|
int tailSize = size % loadSize;
|
||||||
|
if (loopCnt)
|
||||||
|
AscendC::Copy(dst, src, loadSize, loopCnt, {1, 1, 8, 8});
|
||||||
|
AscendC::Copy(dst[loopCnt * loadSize], src[loopCnt * loadSize], tailSize, 1, {1, 1, 8, 8});
|
||||||
|
}
|
||||||
|
} // namespace vllm_ascend
|
||||||
// ==== new file: csrc/ops.h (127 lines) ====
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <optional>
|
||||||
|
#include <torch/library.h>
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
#include "kernels/types.h"
|
||||||
|
#include "torch_npu/csrc/aten/common/from_blob.h"
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
|
||||||
|
void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
|
||||||
|
const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
|
||||||
|
const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
|
||||||
|
const int headSize, const int64_t numTokens, const uint32_t loopCnt,
|
||||||
|
uint32_t aivNum);
|
||||||
|
|
||||||
|
extern void get_masked_input_and_mask_impl(
|
||||||
|
void* stream,
|
||||||
|
void* input,
|
||||||
|
void* masked_input,
|
||||||
|
void* mask_out,
|
||||||
|
const int64_t org_vocab_start_index,
|
||||||
|
const int64_t org_vocab_end_index,
|
||||||
|
const int64_t num_org_vocab_padding,
|
||||||
|
const int64_t added_vocab_start_index,
|
||||||
|
const int64_t added_vocab_end_index,
|
||||||
|
const int64_t size,
|
||||||
|
const uint32_t loop_cnt,
|
||||||
|
const uint32_t aiv_num);
|
||||||
|
|
||||||
|
// Build a new tensor that aliases `tensor`'s raw storage via from_blob,
// preserving sizes, strides and options. NOTE(review): from_blob does not
// take ownership, so the caller must keep the source tensor alive while the
// returned alias is in use — confirm at call sites.
//
// Fix: marked `inline`. This definition lives in a header (csrc/ops.h), so a
// non-inline definition violates the one-definition rule as soon as the
// header is included by more than one translation unit.
inline torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
    // Only NPU (PrivateUse1) tensors are supported.
    if (!tensor.is_privateuseone()) {
        throw std::runtime_error("Tensor must be on NPU device");
    }
    // Get the raw data pointer
    void* data_ptr = tensor.data_ptr();
    // Get tensor sizes and strides
    std::vector<int64_t> sizes = tensor.sizes().vec();
    std::vector<int64_t> strides = tensor.strides().vec();
    // Get tensor options (dtype, device)
    auto options = tensor.options();
    // Create a new tensor over the same raw data pointer
    auto new_tensor = at_npu::native::from_blob(data_ptr, sizes, strides, options);
    return new_tensor;
}
|
||||||
|
|
||||||
|
extern void bgmv_shrink_impl(
|
||||||
|
AscendType type,
|
||||||
|
void *stream,
|
||||||
|
void *x,
|
||||||
|
void *weight,
|
||||||
|
void *indices,
|
||||||
|
uint32_t indicesSize,
|
||||||
|
void *y,
|
||||||
|
uint32_t batch_size,
|
||||||
|
uint32_t num_tokens_per_core,
|
||||||
|
uint32_t input_hidden_dim,
|
||||||
|
uint32_t lora_rank,
|
||||||
|
float scale);
|
||||||
|
|
||||||
|
extern void bgmv_expand_impl(
|
||||||
|
AscendType type,
|
||||||
|
void *stream,
|
||||||
|
void *x,
|
||||||
|
void *weight,
|
||||||
|
void *indices,
|
||||||
|
uint32_t indicesSize,
|
||||||
|
void *y,
|
||||||
|
void *y_out,
|
||||||
|
uint32_t batch_size,
|
||||||
|
uint32_t num_tokens_per_core,
|
||||||
|
uint32_t lora_rank,
|
||||||
|
uint32_t output_hidden_dim,
|
||||||
|
uint32_t slice_offset,
|
||||||
|
uint32_t output_full_dim);
|
||||||
|
|
||||||
|
extern void sgmv_shrink_impl(
|
||||||
|
AscendType type,
|
||||||
|
void *stream,
|
||||||
|
void *x,
|
||||||
|
void *weight,
|
||||||
|
void *loraIndices,
|
||||||
|
uint32_t loraIndicesSize,
|
||||||
|
void *seqLen,
|
||||||
|
uint32_t seqLenSize,
|
||||||
|
void *y,
|
||||||
|
uint32_t batch_size,
|
||||||
|
uint32_t num_tokens_per_core,
|
||||||
|
uint32_t input_hidden_dim,
|
||||||
|
uint32_t lora_rank,
|
||||||
|
float scale);
|
||||||
|
|
||||||
|
extern void sgmv_expand_impl(
|
||||||
|
AscendType type,
|
||||||
|
void *stream,
|
||||||
|
void *x,
|
||||||
|
void *weight,
|
||||||
|
void *loraIndices,
|
||||||
|
uint32_t loraIndicesSize,
|
||||||
|
void *seqLen,
|
||||||
|
uint32_t seqLenSize,
|
||||||
|
void *y,
|
||||||
|
void *y_out,
|
||||||
|
uint32_t batch_size,
|
||||||
|
uint32_t num_tokens_per_core,
|
||||||
|
uint32_t lora_rank,
|
||||||
|
uint32_t output_hidden_dim,
|
||||||
|
uint32_t slice_offset,
|
||||||
|
uint32_t output_full_dim);
|
||||||
|
}
|
||||||
// ==== new file: csrc/torch_binding.cpp (428 lines) ====
|
|||||||
|
/*
|
||||||
|
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <torch/extension.h>
|
||||||
|
#include <torch/library.h>
|
||||||
|
#include <torch/version.h>
|
||||||
|
#include <torch_npu/csrc/core/npu/NPUStream.h>
|
||||||
|
#include <torch_npu/csrc/framework/OpCommand.h>
|
||||||
|
#include <torch_npu/csrc/npu/Module.h>
|
||||||
|
#include <pybind11/pybind11.h>
|
||||||
|
#include "acl/acl.h"
|
||||||
|
#include "ops.h"
|
||||||
|
#include "utils.h"
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
|
||||||
|
// Map a torch scalar type onto the kernel dtype tag. Any type other than
// fp32/bf16 falls through to FP16 — same behavior as the original chain of
// if/else branches.
AscendType get_dtype_from_torch(at::ScalarType scalarType)
{
    switch (scalarType) {
        case at::ScalarType::Float:
            return AscendType::FP32;
        case at::ScalarType::BFloat16:
            return AscendType::BF16;
        default:
            return AscendType::FP16;
    }
}
|
||||||
|
|
||||||
|
// Apply rotary position embedding to `query` and `key` via the custom NPU
// kernel. Returns freshly-allocated (query_dst, key_dst) tensors of shape
// [num_tokens, num_(kv_)heads, head_size]; inputs are read-only. Only the
// neox layout is supported, and head_size must be a multiple of 32.
//
// Fix: removed the unused local `int32_t deviceId = 0;` — the device id used
// for aclGetDeviceCapability is declared inside the custom handler lambda.
std::tuple<at::Tensor, at::Tensor> rotary_embedding(at::Tensor &positions, at::Tensor &query, at::Tensor &key,
                                                    int64_t head_size, at::Tensor &cos_sin_cache, bool is_neox)
{
    int64_t num_tokens = positions.numel();
    int positions_ndim = positions.dim();
    TORCH_CHECK(
        positions_ndim == 1 || positions_ndim == 2,
        "positions must have shape [num_tokens] or [batch_size, seq_len]");
    if (positions_ndim == 1) {
        TORCH_CHECK(
            query.size(0) == positions.size(0) && key.size(0) == positions.size(0),
            "query, key and positions must have the same number of tokens");
    }
    if (positions_ndim == 2) {
        TORCH_CHECK(
            query.size(0) == positions.size(0) &&
                key.size(0) == positions.size(0) &&
                query.size(1) == positions.size(1) &&
                key.size(1) == positions.size(1),
            "query, key and positions must have the same batch_size and seq_len");
    }
    TORCH_CHECK(head_size % 32 == 0, "rotary_embedding: headSize should be divisible by 32");
    int query_hidden_size = query.numel() / num_tokens;
    int key_hidden_size = key.numel() / num_tokens;
    TORCH_CHECK(query_hidden_size % head_size == 0);
    TORCH_CHECK(key_hidden_size % head_size == 0);
    TORCH_CHECK(is_neox == true, "rotary_embedding: neox=false is not supported as custom kernel in vllm-ascend");

    // Make sure query and key have consistent number of heads.
    int num_heads = query_hidden_size / head_size;
    int num_kv_heads = key_hidden_size / head_size;
    TORCH_CHECK(num_heads % num_kv_heads == 0);
    at::Tensor query_dst = at::empty({num_tokens, num_heads, head_size}, query.options());
    at::Tensor key_dst = at::empty({num_tokens, num_kv_heads, head_size}, key.options());

    int rot_dim = cos_sin_cache.size(1);
    int seq_dim_idx = positions_ndim - 1;
    // Raw pointers / strides captured by value into the async handler below.
    int64_t *position_ids_ptr = positions.data_ptr<int64_t>();
    void *query_dst_ptr = query_dst.data_ptr();
    void *key_dst_ptr = key_dst.data_ptr();
    void *query_ptr = query.data_ptr();
    void *key_ptr = key.data_ptr();
    void *cos_sin_cache_ptr = cos_sin_cache.data_ptr();
    int64_t query_stride = query.stride(seq_dim_idx);
    int64_t key_stride = key.stride(seq_dim_idx);
    int64_t dst_query_stride = query_dst.stride(0);
    int64_t dst_key_stride = key_dst.stride(0);
    at::ScalarType scalar_type = query.scalar_type();
    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
    at_npu::native::OpCommand cmd;
    cmd.Name("rotary_embedding");
    cmd.SetCustomHandler([scalar_type, is_neox, num_tokens, stream, position_ids_ptr, query_dst_ptr, key_dst_ptr,
                          query_ptr, key_ptr, cos_sin_cache_ptr, rot_dim, query_stride, key_stride,
                          dst_query_stride, dst_key_stride, num_heads, num_kv_heads, head_size]() -> int {
        auto dtype_num = get_dtype_from_torch(scalar_type);
        int device_id = 0;
        int64_t aiv_num = 0;
        // Query the number of vector cores so the work can be spread evenly.
        TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
        uint32_t loop_cnt = (num_tokens + aiv_num - 1) / aiv_num;
        rotary_embedding_impl(dtype_num, is_neox, stream, position_ids_ptr, query_dst_ptr, key_dst_ptr, query_ptr,
                              key_ptr, cos_sin_cache_ptr, rot_dim, query_stride, key_stride, dst_query_stride,
                              dst_key_stride, num_heads, num_kv_heads, head_size, num_tokens, loop_cnt, aiv_num);
        return 0;
    });
    cmd.Run();
    return {query_dst, key_dst};
}
|
||||||
|
|
||||||
|
// Remap token ids for vocab-parallel embedding (base vocab shard + LoRA-added
// vocab shard packed into one tensor) and produce an out-of-range mask.
// Returns (masked_input, mask), both the shape of `input`; `mask` is bool.
// See:
// https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L161-L198
// for the sharding layout this mirrors:
//   org_vocab_start_index / org_vocab_end_index  — base embedding range
//   num_org_vocab_padding                        — padding after the base range
//   added_vocab_start_index / added_vocab_end_index — LoRA-added range
std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
    at::Tensor &input,
    const int64_t org_vocab_start_index,
    const int64_t org_vocab_end_index,
    const int64_t num_org_vocab_padding,
    const int64_t added_vocab_start_index,
    const int64_t added_vocab_end_index)
{
    // Validate the index layout before touching the device.
    TORCH_CHECK(input.dim() >= 1, "input must have at least 1 dimension");
    TORCH_CHECK(org_vocab_start_index >= 0, "org_vocab_start_index must be non-negative");
    TORCH_CHECK(org_vocab_end_index >= org_vocab_start_index, "org_vocab_end_index must be greater than org_vocab_start_index");
    TORCH_CHECK(num_org_vocab_padding >= 0, "num_org_vocab_padding must be non-negative");
    TORCH_CHECK(added_vocab_start_index >= org_vocab_end_index, "added_vocab_start_index must be greater than org_vocab_end_index");
    TORCH_CHECK(added_vocab_end_index >= added_vocab_start_index, "added_vocab_end_index must be greater than added_vocab_start_index");

    // Flattened element count drives the per-core loop count below.
    int64_t size = input.numel();

    // Outputs: remapped ids plus a bool out-of-range mask.
    at::Tensor masked_input = at::empty_like(input);
    at::Tensor mask = at::empty_like(input).to(at::kBool);

    void *input_ptr = input.data_ptr();
    void *masked_input_ptr = masked_input.data_ptr();
    void *mask_ptr = mask.data_ptr();

    at::ScalarType scalar_type = input.scalar_type();
    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();

    // Queue the kernel launch through OpCommand's custom handler.
    at_npu::native::OpCommand cmd;
    cmd.Name("get_masked_input_and_mask");
    cmd.SetCustomHandler([scalar_type, size, stream,
                          input_ptr, masked_input_ptr, mask_ptr,
                          org_vocab_start_index, org_vocab_end_index,
                          num_org_vocab_padding, added_vocab_start_index,
                          added_vocab_end_index]() -> int {
        int device_id = 0;
        int64_t aiv_num = 0;
        // Spread `size` elements over the available vector cores.
        TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
        uint32_t loop_cnt = (size + aiv_num - 1) / aiv_num;

        get_masked_input_and_mask_impl(
            stream,
            input_ptr,
            masked_input_ptr,
            mask_ptr,
            org_vocab_start_index,
            org_vocab_end_index,
            num_org_vocab_padding,
            added_vocab_start_index,
            added_vocab_end_index,
            size,
            loop_cnt,
            aiv_num);

        return 0;
    });
    cmd.Run();
    return {masked_input, mask};
}
|
||||||
|
|
||||||
|
void bgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y, double scale)
{
    // Batched grouped matrix-vector multiply, "shrink" direction: projects the
    // hidden input down to the (smaller) LoRA rank per batch entry, selecting
    // the LoRA weight via `indices`. The result is written into `y` by the NPU
    // kernel (bgmv_shrink_impl); `scale` is applied inside the kernel.
    at::ScalarType scalar_type = x.scalar_type();
    TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16");
    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
    TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]");
    TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0),
                "the first dimension of x, y, indices should be same");
    TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out");

    // Raw pointers/sizes are captured by value so the async handler does not
    // reference the at::Tensor wrappers after this function returns.
    void* x_ptr = x.data_ptr();
    void* weight_ptr = weight.data_ptr();
    void* indices_ptr = indices.data_ptr();
    int indices_size = indices.size(0);
    void* y_ptr = y.data_ptr();
    int batch_size = x.size(0);
    int input_hidden_token = x.size(1);
    uint32_t lora_rank = y.size(1);
    float scale_f = static_cast<float>(scale);
    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();

    at_npu::native::OpCommand cmd;
    cmd.Name("bgmv_shrink");
    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, batch_size,
                          input_hidden_token, lora_rank, scale_f]() -> int {
        auto dtype = get_dtype_from_torch(scalar_type);
        int device_id = 0;
        int64_t aiv_num = 0;
        TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
        // Guard the divisor before use; a zero vector-core count would trap below.
        TORCH_CHECK(aiv_num > 0, "aiv_num should be greater than 0");
        int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
        // BUGFIX: the condition used to be the string literal "num_tokens_per_core != 0",
        // which is always truthy, so this check could never fire.
        TORCH_CHECK(num_tokens_per_core != 0, "num_tokens_per_core should not be 0");
        bgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, batch_size,
                         num_tokens_per_core, input_hidden_token, lora_rank, scale_f);
        return 0;
    });
    cmd.Run();
}
|
||||||
|
|
||||||
|
at::Tensor bgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y,
                       int64_t slice_offset, int64_t slice_size)
{
    // Batched grouped matrix-vector multiply, "expand" direction: projects the
    // LoRA-rank input `x` up into a [slice_offset, slice_offset + slice_size)
    // column slice of `y`, selecting the LoRA weight via `indices`. Returns `y`
    // (the kernel writes into it in place via y_out, which aliases y).
    at::ScalarType scalar_type = y.scalar_type();
    TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16");
    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
    TORCH_CHECK(indices.dim() == 1, "indices should be [batch_size]");
    TORCH_CHECK(x.size(0) == y.size(0) && x.size(0) == indices.size(0),
                "the first dimension of x, y, indices should be same");
    TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out");
    TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0");
    // BUGFIX: statement was missing its terminating semicolon.
    TORCH_CHECK((slice_size + slice_offset) <= y.size(1),
                "slice_size + slice_offset should be smaller than the second dimension of y");

    // y_out aliases y: the kernel output is written into the caller's tensor.
    at::Tensor y_out = y;
    void* x_ptr = x.data_ptr();
    void* weight_ptr = weight.data_ptr();
    void* indices_ptr = indices.data_ptr();
    int indices_size = indices.size(0);
    void* y_ptr = y.data_ptr();
    void* y_out_ptr = y_out.data_ptr();
    int batch_size = x.size(0);
    int lora_rank = x.size(1);
    int output_full_dim = y.size(1);
    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();

    at_npu::native::OpCommand cmd;
    cmd.Name("bgmv_expand");
    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, y_out_ptr,
                          batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int {
        auto dtype = get_dtype_from_torch(scalar_type);
        int device_id = 0;
        int64_t aiv_num = 0;
        TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
        // Guard the divisor before use; a zero vector-core count would trap below.
        TORCH_CHECK(aiv_num > 0, "aiv_num should be greater than 0");
        int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
        // BUGFIX: the condition used to be the string literal "num_tokens_per_core != 0",
        // which is always truthy, so this check could never fire.
        TORCH_CHECK(num_tokens_per_core != 0, "num_tokens_per_core should not be 0");
        bgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, indices_ptr, indices_size, y_ptr, y_out_ptr, batch_size,
                         num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim);
        return 0;
    });
    cmd.Run();
    return y_out;
}
|
||||||
|
|
||||||
|
void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
|
||||||
|
at::Tensor &y, double scale)
|
||||||
|
{
|
||||||
|
at::ScalarType scalar_type = x.scalar_type();
|
||||||
|
TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16");
|
||||||
|
TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
|
||||||
|
TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
|
||||||
|
"weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
|
||||||
|
TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
|
||||||
|
TORCH_CHECK(x.size(1) > y.size(1), "hidden in should be greater than hidden out");
|
||||||
|
void* x_ptr = x.data_ptr();
|
||||||
|
void* weight_ptr = weight.data_ptr();
|
||||||
|
void* lora_indices_ptr = lora_indices.data_ptr();
|
||||||
|
void* seq_len_ptr = seq_len.data_ptr();
|
||||||
|
int lora_indices_size = lora_indices.size(0);
|
||||||
|
int seq_len_size = seq_len.size(0);
|
||||||
|
void* y_ptr = y.data_ptr();
|
||||||
|
int batch_size = x.size(0);
|
||||||
|
int input_hidden_token = x.size(1);
|
||||||
|
uint32_t lora_rank = y.size(1);
|
||||||
|
float scale_f = static_cast<float>(scale);
|
||||||
|
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
|
||||||
|
at_npu::native::OpCommand cmd;
|
||||||
|
cmd.Name("sgmv_shrink");
|
||||||
|
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
|
||||||
|
seq_len_ptr, seq_len_size, y_ptr,
|
||||||
|
batch_size, input_hidden_token, lora_rank, scale_f]() -> int {
|
||||||
|
auto dtype = get_dtype_from_torch(scalar_type);
|
||||||
|
int device_id = 0;
|
||||||
|
int64_t aiv_num = 0;
|
||||||
|
TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
|
||||||
|
int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
|
||||||
|
TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
|
||||||
|
sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size,
|
||||||
|
y_ptr, batch_size,
|
||||||
|
num_tokens_per_core, input_hidden_token, lora_rank, scale_f);
|
||||||
|
return 0;
|
||||||
|
});
|
||||||
|
cmd.Run();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
                       at::Tensor &y, int64_t slice_offset, int64_t slice_size)
{
    // Sequence-grouped matrix-vector multiply, "expand" direction: projects the
    // LoRA-rank input `x` up into a [slice_offset, slice_offset + slice_size)
    // column slice of `y`. Rows are grouped into sequences by `seq_len`, each
    // sequence using the LoRA selected by `lora_indices`. Returns `y` (written
    // in place via the aliasing y_out).
    at::ScalarType scalar_type = y.scalar_type();
    TORCH_CHECK(scalar_type == torch::kHalf || scalar_type == torch::kBFloat16, "only support half and bf16");
    TORCH_CHECK(x.dim() == 2, "x should be [batch_size, hidden_in]");
    TORCH_CHECK(weight.dim() == 3 || weight.dim() == 4,
                "weight should be [num_loras, hidden_out, hidden_in] or [num_loras, 1, hidden_out, hidden_in]");
    TORCH_CHECK(y.dim() == 2, "y should be [batch_size, hidden_out]");
    TORCH_CHECK(x.size(1) <= slice_size, "hidden in should be smaller than hidden out");
    TORCH_CHECK(slice_offset >= 0, "slice offset should be no smaller than 0");
    // BUGFIX: statement was missing its terminating semicolon.
    TORCH_CHECK((slice_size + slice_offset) <= y.size(1),
                "slice_size + slice_offset should be smaller than the second dimension of y");

    // y_out aliases y: the kernel output is written into the caller's tensor.
    at::Tensor y_out = y;
    void* x_ptr = x.data_ptr();
    void* weight_ptr = weight.data_ptr();
    void* lora_indices_ptr = lora_indices.data_ptr();
    void* seq_len_ptr = seq_len.data_ptr();
    int lora_indices_size = lora_indices.size(0);
    int seq_len_size = seq_len.size(0);
    void* y_ptr = y.data_ptr();
    void* y_out_ptr = y_out.data_ptr();
    int batch_size = x.size(0);
    int lora_rank = x.size(1);
    int output_full_dim = y.size(1);
    aclrtStream stream = c10_npu::getCurrentNPUStream().stream();

    at_npu::native::OpCommand cmd;
    cmd.Name("sgmv_expand");
    cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
                          seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
                          batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int {
        auto dtype = get_dtype_from_torch(scalar_type);
        int device_id = 0;
        int64_t aiv_num = 0;
        TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
        // Guard the divisor before use; a zero vector-core count would trap below.
        TORCH_CHECK(aiv_num > 0, "aiv_num should be greater than 0");
        int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
        // BUGFIX: the condition used to be the string literal "num_tokens_per_core != 0",
        // which is always truthy, so this check could never fire.
        TORCH_CHECK(num_tokens_per_core != 0, "num_tokens_per_core should not be 0");
        sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr,
                         seq_len_size, y_ptr, y_out_ptr,
                         batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim);
        return 0;
    });
    cmd.Run();
    return y_out;
}
|
||||||
|
} // namespace vllm_ascend
|
||||||
|
|
||||||
|
// Operator registration for the vllm_ascend._C extension. Each op is declared
// with a TorchScript schema via ops.def(...) and bound to its NPU implementation
// under the PrivateUse1 dispatch key via ops.impl(...). Schema strings are part
// of the runtime contract and must not be edited cosmetically.
TORCH_LIBRARY_EXPAND(_C, ops)
{
    // vLLM-Ascend custom ops
    ops.def("weak_ref_tensor(Tensor input) -> Tensor");
    ops.impl("weak_ref_tensor", torch::kPrivateUse1, &vllm_ascend::weak_ref_tensor);

    // Rotary embedding
    // Apply GPT-NeoX style rotary embedding to query and key.
    ops.def(
        "rotary_embedding(Tensor positions, Tensor! query,"
        "                 Tensor! key, int head_size,"
        "                 Tensor cos_sin_cache, bool is_neox) -> (Tensor query, Tensor key)");
    ops.impl("rotary_embedding", torch::kPrivateUse1, &vllm_ascend::rotary_embedding);

    // Vocab-parallel embedding support: split token ids into base/LoRA ranges
    // and produce a validity mask (see the diagram above the implementation).
    ops.def(
        "get_masked_input_and_mask(Tensor input, "
        "                          int org_vocab_start_index, "
        "                          int org_vocab_end_index, "
        "                          int num_org_vocab_padding, "
        "                          int added_vocab_start_index, "
        "                          int added_vocab_end_index) -> (Tensor masked_input, Tensor mask)");
    ops.impl("get_masked_input_and_mask", torch::kPrivateUse1, &vllm_ascend::get_masked_input_and_mask);

    // LoRA kernels: batched (bgmv) and sequence-grouped (sgmv) shrink/expand.
    ops.def("bgmv_shrink(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y, float scale) -> ()");
    ops.impl("bgmv_shrink", torch::kPrivateUse1, &vllm_ascend::bgmv_shrink);

    ops.def(
        "bgmv_expand(Tensor! x, Tensor! weight, Tensor! indices, Tensor! y,"
        " int slice_offset, int slice_size) -> Tensor");
    ops.impl("bgmv_expand", torch::kPrivateUse1, &vllm_ascend::bgmv_expand);

    ops.def("sgmv_shrink(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y, float scale) -> ()");
    ops.impl("sgmv_shrink", torch::kPrivateUse1, &vllm_ascend::sgmv_shrink);

    ops.def(
        "sgmv_expand(Tensor! x, Tensor! weight, Tensor! lora_indices, Tensor! seq_len, Tensor! y,"
        " int slice_offset, int slice_size) -> Tensor");
    ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand);
}
|
||||||
|
|
||||||
|
REGISTER_EXTENSION(_C)
|
||||||
102
csrc/torch_binding_meta.cpp
Normal file
102
csrc/torch_binding_meta.cpp
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
#include <torch/extension.h>
|
||||||
|
#include <torch/library.h>
|
||||||
|
#include <torch/version.h>
|
||||||
|
#include <torch_npu/csrc/core/npu/NPUStream.h>
|
||||||
|
#include <torch_npu/csrc/framework/OpCommand.h>
|
||||||
|
#include <torch_npu/csrc/npu/Module.h>
|
||||||
|
#include "utils.h"
|
||||||
|
/*
|
||||||
|
* How to write a meta implementation for a custom operator (meta kernel):
|
||||||
|
*
|
||||||
|
* Meta implementations are used for shape and dtype inference, tracing, and export.
|
||||||
|
* They do NOT perform any real computation or allocate device memory.
|
||||||
|
* Instead, they return empty tensors with the correct shapes, dtypes, and device types.
|
||||||
|
*
|
||||||
|
* Steps to write a meta implementation:
|
||||||
|
* 1. The function signature should match the operator's schema, but only use the arguments
|
||||||
|
* necessary to infer output shapes and dtypes.
|
||||||
|
* 2. Use input tensor shapes, dtypes, and any relevant arguments to compute the output shapes.
|
||||||
|
* 3. Return empty tensors (e.g., at::empty_symint, at::empty_like) with the correct shape and dtype.
|
||||||
|
* 4. Do NOT perform any real computation or data movement.
|
||||||
|
* 5. Register the meta implementation with the "Meta" dispatch key using TORCH_LIBRARY_IMPL or similar.
|
||||||
|
*
|
||||||
|
* Example:
|
||||||
|
* std::tuple<at::Tensor, at::Tensor> my_op_meta(
|
||||||
|
* at::Tensor &input, int64_t some_param) {
|
||||||
|
* // Infer output shape based on input and parameters
|
||||||
|
* auto out_shape = ...;
|
||||||
|
* at::Tensor out = at::empty_symint(out_shape, input.options());
|
||||||
|
* // Return empty tensor(s) with correct shape/dtype
|
||||||
|
* return {out, ...};
|
||||||
|
* }
|
||||||
|
*
|
||||||
|
* See below for real examples.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace vllm_ascend {
|
||||||
|
namespace meta {
|
||||||
|
|
||||||
|
std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
    at::Tensor &positions,
    at::Tensor &query,
    at::Tensor &key,
    int64_t head_size,
    at::Tensor &cos_sin_cache,
    bool is_neox) {
    // Shape-only ("meta") kernel: no computation, no device memory. Derive the
    // per-token head counts from the flat hidden sizes and return empty
    // outputs of shape [num_tokens, num_(kv_)heads, head_size].
    const auto token_count = positions.sym_numel();
    const auto q_hidden = query.sym_numel() / token_count;
    const auto k_hidden = key.sym_numel() / token_count;
    const auto q_heads = q_hidden / head_size;
    const auto k_heads = k_hidden / head_size;

    at::Tensor out_query = at::empty_symint({token_count, q_heads, head_size}, query.options());
    at::Tensor out_key = at::empty_symint({token_count, k_heads, head_size}, key.options());
    return {out_query, out_key};
}
|
||||||
|
|
||||||
|
std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask_meta(
    at::Tensor &input,
    const int64_t org_vocab_start_index,
    const int64_t org_vocab_end_index,
    const int64_t num_org_vocab_padding,
    const int64_t added_vocab_start_index,
    const int64_t added_vocab_end_index) {
    // Shape-only ("meta") kernel: both outputs mirror the input's shape — one
    // keeps the input dtype, the other is a boolean mask. The vocab-index
    // arguments influence values only, never shapes, so they are unused here.
    return {at::empty_like(input),
            at::empty_like(input, input.options().dtype(at::kBool))};
}
|
||||||
|
|
||||||
|
at::Tensor bgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &indices, at::Tensor &y,
                            int64_t slice_offset, int64_t slice_size) {
    // Shape-only ("meta") kernel: the real op returns a tensor shaped like y,
    // so an empty clone of y's shape/options is sufficient here.
    return at::empty_like(y);
}
|
||||||
|
|
||||||
|
at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at::Tensor &seq_len,
                            at::Tensor &y, int64_t slice_offset, int64_t slice_size) {
    // Shape-only ("meta") kernel: the real op returns a tensor shaped like y,
    // so an empty clone of y's shape/options is sufficient here.
    return at::empty_like(y);
}
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace meta
|
||||||
|
} // namespace vllm_ascend
|
||||||
|
|
||||||
|
namespace {
// Register the meta implementations of the custom kernels for symbolic tracing;
// this also allows the custom kernels to be captured into an aclgraph.
TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) {
    // Rotary embedding meta implementation
    ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
    // Masked input and mask meta implementation
    ops.impl("get_masked_input_and_mask", &vllm_ascend::meta::get_masked_input_and_mask_meta);
    // Bgmv expand
    ops.impl("bgmv_expand", &vllm_ascend::meta::bgmv_expand_meta);
    // Sgmv expand
    ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta);
}
}  // anonymous namespace
|
||||||
31
csrc/utils.h
Normal file
31
csrc/utils.h
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
#pragma once

#include "kernels/types.h"
#include <c10/core/ScalarType.h>
#include <Python.h>

// Two-level token pasting: the outer CONCAT expands its macro arguments first,
// so e.g. CONCAT(PyInit_, NAME) works when NAME is itself a macro.
#define _CONCAT(A, B) A##B
#define CONCAT(A, B) _CONCAT(A, B)

// Two-level stringification, for the same macro-argument-expansion reason.
#define _STRINGIFY(A) #A
#define STRINGIFY(A) _STRINGIFY(A)

// A version of the TORCH_LIBRARY macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)

// A version of the TORCH_LIBRARY_IMPL macro that expands the NAME, i.e. so NAME
// could be a macro instead of a literal token.
#define TORCH_LIBRARY_IMPL_EXPAND(NAME, DEVICE, MODULE) \
    TORCH_LIBRARY_IMPL(NAME, DEVICE, MODULE)

// REGISTER_EXTENSION allows the shared library to be loaded and initialized
// via python's import statement. It defines PyInit_<NAME> returning an empty
// module; the actual operators are registered via the torch library machinery.
#define REGISTER_EXTENSION(NAME)                                       \
    PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                           \
        static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,     \
                                            STRINGIFY(NAME), nullptr, 0, nullptr}; \
        return PyModule_Create(&module);                               \
    }
|
||||||
|
|
||||||
|
|
||||||
25
docs/Makefile
Normal file
25
docs/Makefile
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# Build the translated (zh_CN) HTML docs into $(BUILDDIR)/html/zh-cn,
# compiling the .po catalogs with sphinx-intl first.
intl:
	sphinx-intl build
	@$(SPHINXBUILD) -b html -D language=zh_CN "$(SOURCEDIR)" "$(BUILDDIR)/html/zh-cn" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
24
docs/README.md
Normal file
24
docs/README.md
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# vLLM Ascend Plugin documents
|
||||||
|
|
||||||
|
Live doc: https://vllm-ascend.readthedocs.io
|
||||||
|
|
||||||
|
## Build the docs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install dependencies.
|
||||||
|
pip install -r requirements-docs.txt
|
||||||
|
|
||||||
|
# Build the docs.
|
||||||
|
make clean
|
||||||
|
make html
|
||||||
|
|
||||||
|
# Build the docs with translation
|
||||||
|
make intl
|
||||||
|
|
||||||
|
# Open the docs with your browser
|
||||||
|
python -m http.server -d _build/html/
|
||||||
|
```
|
||||||
|
|
||||||
|
Launch your browser and open:
|
||||||
|
- English version: http://localhost:8000
|
||||||
|
- Chinese version: http://localhost:8000/zh-cn
|
||||||
10
docs/requirements-docs.txt
Normal file
10
docs/requirements-docs.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
sphinx
|
||||||
|
sphinx-argparse
|
||||||
|
sphinx-book-theme
|
||||||
|
sphinx-copybutton
|
||||||
|
sphinx-design
|
||||||
|
sphinx-togglebutton
|
||||||
|
myst-parser
|
||||||
|
msgspec
|
||||||
|
sphinx-substitution-extensions
|
||||||
|
sphinx-intl
|
||||||
2
docs/requirements-test.txt
Normal file
2
docs/requirements-test.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
pytest-asyncio
|
||||||
|
pytest-mock
|
||||||
58
docs/source/_templates/sections/header.html
Normal file
58
docs/source/_templates/sections/header.html
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
<!--
|
||||||
|
**********************************************************************
|
||||||
|
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
* Copyright 2023 The vLLM team.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
* This file is a part of the vllm-ascend project.
|
||||||
|
* Adapted from https://github.com/vllm-project/vllm/blob/main/docs/source/_templates/sections/header.html
|
||||||
|
**********************************************************************
|
||||||
|
-->
|
||||||
|
<style>
|
||||||
|
.notification-bar {
|
||||||
|
width: 100vw;
|
||||||
|
display: flex;
|
||||||
|
justify-content: center;
|
||||||
|
align-items: center;
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
.notification-bar p {
|
||||||
|
margin: 0;
|
||||||
|
}
|
||||||
|
.notification-bar a {
|
||||||
|
font-weight: bold;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Light mode styles (default) */
|
||||||
|
.notification-bar {
|
||||||
|
background-color: #fff3cd;
|
||||||
|
color: #856404;
|
||||||
|
}
|
||||||
|
.notification-bar a {
|
||||||
|
color: #d97706;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Dark mode styles */
|
||||||
|
html[data-theme=dark] .notification-bar {
|
||||||
|
background-color: #333;
|
||||||
|
color: #ddd;
|
||||||
|
}
|
||||||
|
html[data-theme=dark] .notification-bar a {
|
||||||
|
color: #ffa500; /* Brighter color for visibility */
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<div class="notification-bar">
|
||||||
|
<p>You are viewing the latest developer preview docs. <a href="https://vllm-ascend.readthedocs.io/en/v0.9.1-dev">Click here</a> to view docs for the latest stable release (v0.9.1).</p>
|
||||||
|
</div>
|
||||||
BIN
docs/source/assets/multi_node_dp_deepseek.png
Normal file
BIN
docs/source/assets/multi_node_dp_deepseek.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 90 KiB |
BIN
docs/source/assets/multi_node_dp_kimi.png
Normal file
BIN
docs/source/assets/multi_node_dp_kimi.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 129 KiB |
138
docs/source/community/contributors.md
Normal file
138
docs/source/community/contributors.md
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
# Maintainers and contributors
|
||||||
|
|
||||||
|
## Maintainers
|
||||||
|
|
||||||
|
| Name | Github ID | Date |
|
||||||
|
|:-----------:|:-----:|:-----:|
|
||||||
|
| Xiyuan Wang| [@wangxiyuan](https://github.com/wangxiyuan) | 2025/01 |
|
||||||
|
| Yikun Jiang| [@Yikun](https://github.com/Yikun) | 2025/02 |
|
||||||
|
| Yi Gan| [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/02 |
|
||||||
|
| Shoujian Zheng| [@jianzs](https://github.com/jianzs) | 2025/06 |
|
||||||
|
| Wengang Chen | [@ApsarasX](https://github.com/ApsarasX) | 2025/08 |
|
||||||
|
| Mengqing Cao | [@MengqingCao](https://github.com/MengqingCao) | 2025/08 |
|
||||||
|
|
||||||
|
## Contributors
|
||||||
|
|
||||||
|
vLLM Ascend every release would not have been possible without the following contributors:
|
||||||
|
|
||||||
|
Updated on 2025-09-03:
|
||||||
|
|
||||||
|
| Number | Contributor | Date | Commit ID |
|
||||||
|
|:------:|:-----------:|:-----:|:---------:|
|
||||||
|
| 117 | [@panchao-hub](https://github.com/panchao-hub) | 2025/8/30 | [7215454](https://github.com/vllm-project/vllm-ascend/commit/7215454de6df78f4f9a49a99c5739f8bb360f5bc) |
|
||||||
|
| 116 | [@lidenghui1110](https://github.com/lidenghui1110) | 2025/8/29 | [600b08f](https://github.com/vllm-project/vllm-ascend/commit/600b08f7542be3409c2c70927c91471e8de33d03) |
|
||||||
|
| 115 | [@NSDie](https://github.com/NSDie) | 2025/8/28 | [1191a64](https://github.com/vllm-project/vllm-ascend/commit/1191a64ae508183d5613711bc98a90250963f83a) |
|
||||||
|
| 114 | [@s-jiayang](https://github.com/s-jiayang) | 2025/8/27 | [6a4ec18](https://github.com/vllm-project/vllm-ascend/commit/6a4ec186e731b9516235f4fd30b5b98227513fe7) |
|
||||||
|
| 113 | [@LookAround0301](https://github.com/LookAround0301) | 2025/8/22 | [e9fb895](https://github.com/vllm-project/vllm-ascend/commit/e9fb895b10cef37ea634f4d4af71686b09ca9f20) |
|
||||||
|
| 112 | [@ZhaoJiangJiang](https://github.com/ZhaoJiangJiang) | 2025/8/22 | [3629bc4](https://github.com/vllm-project/vllm-ascend/commit/3629bc4431d3edb4224761f9036b3bddb16158d6) |
|
||||||
|
| 111 | [@NicholasTao](https://github.com/NicholasTao) | 2025/8/20 | [7bec1a9](https://github.com/vllm-project/vllm-ascend/commit/7bec1a9b9c372785551d45682bf11063ec42b216) |
|
||||||
|
| 110 | [@gameofdimension](https://github.com/gameofdimension) | 2025/8/19 | [27d038d](https://github.com/vllm-project/vllm-ascend/commit/27d038dc663bf550a35a8f15659493b2abefda07) |
|
||||||
|
| 109 | [@liuchenbing](https://github.com/liuchenbing) | 2025/8/19 | [3648d18](https://github.com/vllm-project/vllm-ascend/commit/3648d18e673f15a33a82d6ea95d3a9dd891ff1f5) |
|
||||||
|
| 108 | [@LCAIZJ](https://github.com/LCAIZJ) | 2025/8/18 | [03ca2b2](https://github.com/vllm-project/vllm-ascend/commit/03ca2b26ca9ab6b9a12f021b0595a726ee35e223) |
|
||||||
|
| 107 | [@haojiangzheng](https://github.com/haojiangzheng) | 2025/8/11 | [eb43a47](https://github.com/vllm-project/vllm-ascend/commit/eb43a475f429192e7509e85e28b1c65d5097f373) |
|
||||||
|
| 106 | [@QwertyJack](https://github.com/QwertyJack) | 2025/8/11 | [9c6d108](https://github.com/vllm-project/vllm-ascend/commit/9c6d108330574176f79eea52f989ea6049336af8) |
|
||||||
|
| 105 | [@SlightwindSec](https://github.com/SlightwindSec) | 2025/8/5 | [f3b50c5](https://github.com/vllm-project/vllm-ascend/commit/f3b50c54e8243ad8ccefb9b033277fbdd382a9c4) |
|
||||||
|
| 104 | [@CaveNightingale](https://github.com/CaveNightingale) | 2025/8/4 | [957c7f1](https://github.com/vllm-project/vllm-ascend/commit/957c7f108d5f0aea230220ccdc18d657229e4030) |
|
||||||
|
| 103 | [@underfituu](https://github.com/underfituu) | 2025/8/4 | [e38fab0](https://github.com/vllm-project/vllm-ascend/commit/e38fab011d0b81f3a8e40d9bbe263c283dd4129b) |
|
||||||
|
| 102 | [@yangqinghao-cmss](https://github.com/yangqinghao-cmss) | 2025/8/1 | [99fa0ac](https://github.com/vllm-project/vllm-ascend/commit/99fa0ac882c79ae9282940125b042a44ea422757) |
|
||||||
|
| 101 | [@pjgao](https://github.com/pjgao) | 2025/7/31 | [6192bc9](https://github.com/vllm-project/vllm-ascend/commit/6192bc95c0e47097836e9be1f30f2a0a6fdca088) |
|
||||||
|
| 100 | [@Liccol](https://github.com/Liccol) | 2025/7/31 | [7c90ba5](https://github.com/vllm-project/vllm-ascend/commit/7c90ba5fe8e420b891fdd30df050a33e3767835d) |
|
||||||
|
| 99 | [@1024daniel](https://github.com/1024daniel) | 2025/7/31 | [db310c6](https://github.com/vllm-project/vllm-ascend/commit/db310c6ec97b056296f7c2348b90c1d96d0b562a) |
|
||||||
|
| 98 | [@zhoux77899](https://github.com/zhoux77899) | 2025/7/30 | [4fcca13](https://github.com/vllm-project/vllm-ascend/commit/4fcca137a70c11daa4070ae014288be154715939) |
|
||||||
|
| 97 | [@YuanCheng-coder](https://github.com/YuanCheng-coder) | 2025/7/30 | [34dd24a](https://github.com/vllm-project/vllm-ascend/commit/34dd24adf21fb85a2c413292754b1599832efae2) |
|
||||||
|
| 96 | [@hongfugui](https://github.com/hongfugui) | 2025/7/30 | [1dbb888](https://github.com/vllm-project/vllm-ascend/commit/1dbb8882759e4326f5706f6e610674423376c2f3) |
|
||||||
|
| 95 | [@Irving11-BKN](https://github.com/Irving11-BKN) | 2025/7/29 | [ca8007f](https://github.com/vllm-project/vllm-ascend/commit/ca8007f584141d3a59b2bcbd4f8ba269c9b7e252) |
|
||||||
|
| 94 | [@taoxudonghaha](https://github.com/taoxudonghaha) | 2025/7/29 | [540336e](https://github.com/vllm-project/vllm-ascend/commit/540336edc9db09072a9aaa486fbf7ce625da5b9e) |
|
||||||
|
| 93 | [@loukong33](https://github.com/loukong33) | 2025/7/28 | [1a25b0a](https://github.com/vllm-project/vllm-ascend/commit/1a25b0a2ddb23bf4d731ebac4503efaf237b191f) |
|
||||||
|
| 92 | [@Ronald1995](https://github.com/Ronald1995) | 2025/7/25 | [e561a2c](https://github.com/vllm-project/vllm-ascend/commit/e561a2c6ec4493b490b13a4a9007d8f451ae0d0f) |
|
||||||
|
| 91 | [@ZrBac](https://github.com/ZrBac) | 2025/7/24 | [2ffe051](https://github.com/vllm-project/vllm-ascend/commit/2ffe051859d585df8353d1b9eefb64c44078175a) |
|
||||||
|
| 90 | [@SunnyLee151064](https://github.com/SunnyLee151064) | 2025/7/24 | [34571ea](https://github.com/vllm-project/vllm-ascend/commit/34571ea5ae69529758edf75f0252f86ccb4c7184) |
|
||||||
|
| 89 | [@shiyuan680](https://github.com/shiyuan680) | 2025/7/23 | [ac0bf13](https://github.com/vllm-project/vllm-ascend/commit/ac0bf133f47ead20f18bf71f9be6dbe05fbd218f) |
|
||||||
|
| 88 | [@aidoczh](https://github.com/aidoczh) | 2025/7/21 | [c32eea9](https://github.com/vllm-project/vllm-ascend/commit/c32eea96b73d26268070f57ef98416decc98aff7) |
|
||||||
|
| 87 | [@nuclearwu](https://github.com/nuclearwu) | 2025/7/20 | [54f2b31](https://github.com/vllm-project/vllm-ascend/commit/54f2b311848badc86371d269140e729012a60f2c) |
|
||||||
|
| 86 | [@pkking](https://github.com/pkking) | 2025/7/18 | [3e39d72](https://github.com/vllm-project/vllm-ascend/commit/3e39d7234c0e5c66b184c136c602e87272b5a36e) |
|
||||||
|
| 85 | [@lianyiibo](https://github.com/lianyiibo) | 2025/7/18 | [53d2ea3](https://github.com/vllm-project/vllm-ascend/commit/53d2ea3789ffce32bf3ceb055d5582d28eadc6c7) |
|
||||||
|
| 84 | [@xudongLi-cmss](https://github.com/xudongLi-cmss) | 2025/7/2 | [7fc1a98](https://github.com/vllm-project/vllm-ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576) |
|
||||||
|
| 83 | [@ZhengWG](https://github.com/) | 2025/7/7 | [3a469de](https://github.com/vllm-project/vllm-ascend/commit/9c886d0a1f0fc011692090b0395d734c83a469de) |
|
||||||
|
| 82 | [@wm901115nwpu](https://github.com/) | 2025/7/7 | [a2a47d4](https://github.com/vllm-project/vllm-ascend/commit/f08c4f15a27f0f27132f4ca7a0c226bf0a2a47d4) |
|
||||||
|
| 81 | [@Agonixiaoxiao](https://github.com/) | 2025/7/2 | [6f84576](https://github.com/vllm-project/vllm-ascend/commit/7fc1a984890bd930f670deedcb2dda3a46f84576) |
|
||||||
|
| 80 | [@zhanghw0354](https://github.com/zhanghw0354) | 2025/7/2 | [d3df9a5](https://github.com/vllm-project/vllm-ascend/commit/9fb3d558e5b57a3c97ee5e11b9f5dba6ad3df9a5) |
|
||||||
|
| 79 | [@GDzhu01](https://github.com/GDzhu01) | 2025/6/28 | [de256ac](https://github.com/vllm-project/vllm-ascend/commit/b308a7a25897b88d4a23a9e3d583f4ec6de256ac) |
|
||||||
|
| 78 | [@leo-pony](https://github.com/leo-pony) | 2025/6/26 | [3f2a5f2](https://github.com/vllm-project/vllm-ascend/commit/10253449120307e3b45f99d82218ba53e3f2a5f2) |
|
||||||
|
| 77 | [@zeshengzong](https://github.com/zeshengzong) | 2025/6/26 | [3ee25aa](https://github.com/vllm-project/vllm-ascend/commit/192dbbcc6e244a8471d3c00033dc637233ee25aa) |
|
||||||
|
| 76 | [@sharonyunyun](https://github.com/sharonyunyun) | 2025/6/25 | [2dd8666](https://github.com/vllm-project/vllm-ascend/commit/941269a6c5bbc79f6c1b6abd4680dc5802dd8666) |
|
||||||
|
| 75 | [@Pr0Wh1teGivee](https://github.com/Pr0Wh1teGivee) | 2025/6/25 | [c65dd40](https://github.com/vllm-project/vllm-ascend/commit/2fda60464c287fe456b4a2f27e63996edc65dd40) |
|
||||||
|
| 74 | [@xleoken](https://github.com/xleoken) | 2025/6/23 | [c604de0](https://github.com/vllm-project/vllm-ascend/commit/4447e53d7ad5edcda978ca6b0a3a26a73c604de0) |
|
||||||
|
| 73 | [@lyj-jjj](https://github.com/lyj-jjj) | 2025/6/23 | [5cbd74e](https://github.com/vllm-project/vllm-ascend/commit/5177bef87a21331dcca11159d3d1438075cbd74e) |
|
||||||
|
| 72 | [@farawayboat](https://github.com/farawayboat)| 2025/6/21 | [bc7d392](https://github.com/vllm-project/vllm-ascend/commit/097e7149f75c0806774bc68207f0f6270bc7d392)
|
||||||
|
| 71 | [@yuancaoyaoHW](https://github.com/yuancaoyaoHW) | 2025/6/20 | [7aa0b94](https://github.com/vllm-project/vllm-ascend/commit/00ae250f3ced68317bc91c93dc1f1a0977aa0b94)
|
||||||
|
| 70 | [@songshanhu07](https://github.com/songshanhu07) | 2025/6/18 | [5e1de1f](https://github.com/vllm-project/vllm-ascend/commit/2a70dbbdb8f55002de3313e17dfd595e1de1f)
|
||||||
|
| 69 | [@wangyanhui-cmss](https://github.com/wangyanhui-cmss) | 2025/6/12| [40c9e88](https://github.com/vllm-project/vllm-ascend/commit/2a5fb4014b863cee6abc3009f5bc5340c9e88) |
|
||||||
|
| 68 | [@chenwaner](https://github.com/chenwaner) | 2025/6/11 | [c696169](https://github.com/vllm-project/vllm-ascend/commit/e46dc142bf1180453c64226d76854fc1ec696169) |
|
||||||
|
| 67 | [@yzim](https://github.com/yzim) | 2025/6/11 | [aaf701b](https://github.com/vllm-project/vllm-ascend/commit/4153a5091b698c2270d160409e7fee73baaf701b) |
|
||||||
|
| 66 | [@Yuxiao-Xu](https://github.com/Yuxiao-Xu) | 2025/6/9 | [6b853f1](https://github.com/vllm-project/vllm-ascend/commit/6b853f15fe69ba335d2745ebcf14a164d0bcc505) |
|
||||||
|
| 65 | [@ChenTaoyu-SJTU](https://github.com/ChenTaoyu-SJTU) | 2025/6/7 | [20dedba](https://github.com/vllm-project/vllm-ascend/commit/20dedba5d1fc84b7ae8b49f9ce3e3649389e2193) |
|
||||||
|
| 64 | [@zxdukki](https://github.com/zxdukki) | 2025/6/7 | [87ebaef](https://github.com/vllm-project/vllm-ascend/commit/87ebaef4e4e519988f27a6aa378f614642202ecf) |
|
||||||
|
| 63 | [@sdmyzlp](https://github.com/sdmyzlp) | 2025/6/7 | [3640c60](https://github.com/vllm-project/vllm-ascend/commit/3640c60b0eb4d4cb104e20bfa406d3f1d17920a7) |
|
||||||
|
| 62 | [@weijinqian0](https://github.com/weijinqian0) | 2025/6/7 | [e9ada68](https://github.com/vllm-project/vllm-ascend/commit/e9ada685ece798f9fe0d4a287e3f5246a8a7207b) |
|
||||||
|
| 61 | [@hahazhky](https://github.com/hahazhky) | 2025/6/6 | [0b12c2a](https://github.com/vllm-project/vllm-ascend/commit/0b12c2acf7d9fd192beebebf662298067d9a5435) |
|
||||||
|
| 60 | [@depeng1994](https://github.com/depeng1994) | 2025/6/6 | [6b094a2](https://github.com/vllm-project/vllm-ascend/commit/6b094a2bd49a8a41eb3647568b2d9e5b337db81f) |
|
||||||
|
| 59 | [@David9857](https://github.com/David9857) | 2025/6/5 | [78431b3](https://github.com/vllm-project/vllm-ascend/commit/78431b34694dfa3c8f54ed7cc626660318557927) |
|
||||||
|
| 58 | [@momo609](https://github.com/momo609) | 2025/6/5 | [908a851](https://github.com/vllm-project/vllm-ascend/commit/908a851a776cfd9051cc062119e6ec481561c6f7) |
|
||||||
|
| 57 | [@zhangxinyuehfad](https://github.com/zhangxinyuehfad) | 2025/6/5 | [7737aaa](https://github.com/vllm-project/vllm-ascend/commit/7737aaa40f699b233a35fb61e908b687adc1e2e5) |
|
||||||
|
| 56 | [@NINGBENZHE](https://github.com/NINGBENZHE) | 2025/6/3 | [6ec64a3](https://github.com/vllm-project/vllm-ascend/commit/6ec64a3f9686df65b5a23a41aa301e669db19099) |
|
||||||
|
| 55 | [@XWFAlone](https://github.com/XWFAlone) | 2025/5/30 | [3442fbd](https://github.com/vllm-project/vllm-ascend/commit/3442fbdb235b4c6d72c2bc64a49707a7bd89958e) |
|
||||||
|
| 54 | [@YisongJiang](https://github.com/YisongJiang) | 2025/5/29 | [90afaf6](https://github.com/vllm-project/vllm-ascend/commit/90afaf6306f680307462becf3c78585737579851) |
|
||||||
|
| 53 | [@ponix-j](https://github.com/ponix-j) | 2025/5/23 | [df58fb8](https://github.com/vllm-project/vllm-ascend/commit/df58fb80eee24139fc61c495be3ce79cf81b3f73) |
|
||||||
|
| 52 | [@ttanzhiqiang](https://github.com/ttanzhiqiang) | 2025/5/23 | [dc6172e](https://github.com/vllm-project/vllm-ascend/commit/dc6172efd3860ce95b40a7b3e93611f875f06d40) |
|
||||||
|
| 51 | [@yangpuPKU](https://github.com/yangpuPKU) | 2025/5/23 | [46df67a](https://github.com/vllm-project/vllm-ascend/commit/46df67a5e9ab73fade08cbb2d8c0155cee7316d1) |
|
||||||
|
| 50 | [@wonderful199082](https://github.com/wonderful199082) | 2025/5/20 | [5cf9ff1](https://github.com/vllm-project/vllm-ascend/commit/5cf9ff18e91b0b7031c258d71a257b8e24689763) |
|
||||||
|
| 49 | [@22dimensions](https://github.com/22dimensions) | 2025/5/17 | [a8730e7](https://github.com/vllm-project/vllm-ascend/commit/a8730e7a3c4ac6c4b39a5946c943252fdea6cce5) |
|
||||||
|
| 48 | [@cxcxflying](https://github.com/cxcxflying) | 2025/5/13 | [e564470](https://github.com/vllm-project/vllm-ascend/commit/e56447033889ca95df512208cab22ef832bfdf07) |
|
||||||
|
| 47 | [@NeverRaR](https://github.com/NeverRaR) | 2025/5/12 | [efabd72](https://github.com/vllm-project/vllm-ascend/commit/efabd722eb757e49aa309c173bbec91ca8c4ced1) |
|
||||||
|
| 46 | [@chris668899](https://github.com/chris668899) | 2025/5/8 | [6c02088](https://github.com/vllm-project/vllm-ascend/commit/6c020883a8332b5c519f4f6502733edd9b391c2b) |
|
||||||
|
| 45 | [@sunbaosong](https://github.com/sunbaosong) | 2025/5/6 | [d6bfae8](https://github.com/vllm-project/vllm-ascend/commit/d6bfae8eeebedf677b643b712d367a3a69c9cce4) |
|
||||||
|
| 44 | [@ApsarasX](https://github.com/ApsarasX) | 2025/4/29 | [87975fa](https://github.com/vllm-project/vllm-ascend/commit/87975fa058fe3f90d204ded42a08989a8dcb413e) |
|
||||||
|
| 43 | [@zouyida2052](https://github.com/zouyida2052) | 2025/4/28 | [b9528e6](https://github.com/vllm-project/vllm-ascend/commit/b9528e6ecdc417cf444e55a0ce4a2bafdef0ea3b) |
|
||||||
|
| 42 | [@ZhengJun9](https://github.com/ZhengJun9) | 2025/4/28 | [1791113](https://github.com/vllm-project/vllm-ascend/commit/17911138c90d78a76bd691e9dcb56763db35b19f) |
|
||||||
|
| 41 | [@linfeng-yuan](https://github.com/linfeng-yuan) | 2025/4/28 | [2204e4d](https://github.com/vllm-project/vllm-ascend/commit/2204e4d08f8e10cf9c30154a14eaa5ca956c2acd) |
|
||||||
|
| 40 | [@jianzs](https://github.com/jianzs) | 2025/4/27 | [fa4a5d9](https://github.com/vllm-project/vllm-ascend/commit/fa4a5d980e8845a88b9162cf169f0a5ab230f8a5) |
|
||||||
|
| 39 | [@fakeYan](https://github.com/fakeYan) | 2025/4/23 | [05bdcbe](https://github.com/vllm-project/vllm-ascend/commit/05bdcbeae47c7fcb9b1c30cad059abf1d40b5421) |
|
||||||
|
| 38 | [@RongRongStudio](https://github.com/RongRongStudio) | 2025/4/22 | [848e041](https://github.com/vllm-project/vllm-ascend/commit/848e041a54732c923660dd02daf8e9bf439736a2) |
|
||||||
|
| 37 | [@paulyu12](https://github.com/paulyu12) | 2025/4/17 | [697908f](https://github.com/vllm-project/vllm-ascend/commit/697908f5cd7c65a3a917ec1a962b0886efc98c7e) |
|
||||||
|
| 36 | [@heartStrive1998](https://github.com/heartStrive1998) | 2025/4/16 | [2f15503](https://github.com/vllm-project/vllm-ascend/commit/2f155039dc3997640854daef469bbf0cb77dc6ed) |
|
||||||
|
| 35 | [@eeethenQ](https://github.com/eeethenQ) | 2025/4/15 | [44a8301](https://github.com/vllm-project/vllm-ascend/commit/44a8301424ded94dae83e13b837f5bfc0a1bfc15) |
|
||||||
|
| 34 | [@wxsIcey](https://github.com/wxsIcey) | 2025/4/10 | [d05ea17](https://github.com/vllm-project/vllm-ascend/commit/d05ea17427b82a506b97409a7de8359f18f565f7) |
|
||||||
|
| 33 | [@yx0716](https://github.com/yx0716) | 2025/4/8 | [5d62393](https://github.com/vllm-project/vllm-ascend/commit/5d6239306be9b0f5ac6dbaa137048c372a92ff20) |
|
||||||
|
| 32 | [@celestialli](https://github.com/celestialli) | 2025/4/7 | [2b765dc](https://github.com/vllm-project/vllm-ascend/commit/2b765dcc4974b1bafc26ff5da817ce7e652f0eb0) |
|
||||||
|
| 31 | [@hfadzxy](https://github.com/hfadzxy) | 2025/3/30 | [7beb433](https://github.com/vllm-project/vllm-ascend/commit/7beb4339dc8047af9ef64db1d0a8c59ddbb3709f) |
|
||||||
|
| 30 | [@wuhuikx](https://github.com/wuhuikx) | 2025/3/28 | [57a84bb](https://github.com/vllm-project/vllm-ascend/commit/57a84bb7befeaa0dc62aa35fa406e4d6affbfcca) |
|
||||||
|
| 29 | [@zzzzwwjj](https://github.com/zzzzwwjj) | 2025/3/28 | [12390af](https://github.com/vllm-project/vllm-ascend/commit/12390af075962456ecc8233d8dcce7064b75f390) |
|
||||||
|
| 28 | [@ganyi1996ppo](https://github.com/ganyi1996ppo) | 2025/3/28 | [27e86b9](https://github.com/vllm-project/vllm-ascend/commit/27e86b993a6a810d818143ec9dbfc439a419fa77) |
|
||||||
|
| 27 | [@ZhengZhenyu](https://github.com/ZhengZhenyu) | 2025/3/26 | [0b5a964](https://github.com/vllm-project/vllm-ascend/commit/0b5a9643fd6c3240d7ede669e37209d7ff433841) |
|
||||||
|
| 26 | [@baifanxxx](https://github.com/baifanxxx) | 2025/3/26 | [1225052](https://github.com/vllm-project/vllm-ascend/commit/122505208ff6284f409846ca7294f4a4b9883285) |
|
||||||
|
| 25 | [@rjg-lyh](https://github.com/rjg-lyh) | 2025/3/13 | [6512470](https://github.com/vllm-project/vllm-ascend/commit/65124705fb39d4cc2c94c80254421e067a82fe50) |
|
||||||
|
| 24 | [@xiemingda-1002](https://github.com/xiemingda-1002) | 2025/3/12 | [59ea23d](https://github.com/vllm-project/vllm-ascend/commit/59ea23d0d394879d7f33de6fd22242539b9c3cc5) |
|
||||||
|
| 23 | [@yiz-liu](https://github.com/yiz-liu) | 2025/3/11 | [0db6670](https://github.com/vllm-project/vllm-ascend/commit/0db6670bfab8cb1d84c9e7270df0a1d42d6ce7ca) |
|
||||||
|
| 22 | [@new-TonyWang](https://github.com/new-TonyWang) | 2025/3/11 | [dfb4e23](https://github.com/vllm-project/vllm-ascend/commit/dfb4e23e9d820ac992a071c123bbe983c7b01b2e) |
|
||||||
|
| 21 | [@mengwei805](https://github.com/mengwei805) | 2025/3/6 | [8fcf3d1](https://github.com/vllm-project/vllm-ascend/commit/8fcf3d1704084626db35c5dc82ade446508598d4) |
|
||||||
|
| 20 | [@baymax591](https://github.com/baymax591) | 2025/2/28 | [e8131b9](https://github.com/vllm-project/vllm-ascend/commit/e8131b99cf199f50a304e6e6fb125a1b95bcc92b) |
|
||||||
|
| 19 | [@dependabot](https://github.com/dependabot) | 2025/2/27 | [a5564ed](https://github.com/vllm-project/vllm-ascend/commit/a5564ed5d8fd9818936a22d9ea35951a27513b4c) |
|
||||||
|
| 18 | [@shink](https://github.com/shink) | 2025/2/27 | [6aed833](https://github.com/vllm-project/vllm-ascend/commit/6aed83335cbe92fd0b8ef07c28966a753d012ccb) |
|
||||||
|
| 17 | [@wwfu109](https://github.com/wwfu109) | 2025/2/27 | [b074047](https://github.com/vllm-project/vllm-ascend/commit/b07404766bdaf6e3cebc5cb0aba89a247501302e) |
|
||||||
|
| 16 | [@kunpengW-code](https://github.com/kunpengW-code) | 2025/2/26 | [ca807ce](https://github.com/vllm-project/vllm-ascend/commit/ca807ce49ed64aa89242f5ae29b9862a77648b45) |
|
||||||
|
| 15 | [@Yaphets24](https://github.com/Yaphets24) | 2025/2/22 | [d0b3cb4](https://github.com/vllm-project/vllm-ascend/commit/d0b3cb4fa79d5fc7f8245a3c68885ce1fa030ba4) |
|
||||||
|
| 14 | [@noemotiovon](https://github.com/noemotiovon) | 2025/2/21 | [202b39a](https://github.com/vllm-project/vllm-ascend/commit/202b39a38c2869b0ecc3df486550fb555a2eb0c0) |
|
||||||
|
| 13 | [@SidaoY](https://github.com/SidaoY) | 2025/2/18 | [718c763](https://github.com/vllm-project/vllm-ascend/commit/718c7638555d12cd43ea2a9e497e185778b68595) |
|
||||||
|
| 12 | [@ShiyaNiu](https://github.com/ShiyaNiu) | 2025/2/17 | [36ea38f](https://github.com/vllm-project/vllm-ascend/commit/36ea38fde56437ff1745bd95cd8d9e02a6578d38) |
|
||||||
|
| 11 | [@ji-huazhong](https://github.com/ji-huazhong) | 2025/2/12 | [c8b57d1](https://github.com/vllm-project/vllm-ascend/commit/c8b57d10b24efcd9b4fadeb66cfbf66aa3dd5f82) |
|
||||||
|
| 10 | [@Angazenn](https://github.com/Angazenn) | 2025/2/11 | [7637759](https://github.com/vllm-project/vllm-ascend/commit/7637759056028839c74960d9cfd3ce6275ee5d35) |
|
||||||
|
| 9 | [@whx-sjtu](https://github.com/whx-sjtu) | 2025/2/7 | [8fc5dc9](https://github.com/vllm-project/vllm-ascend/commit/8fc5dc966aaf4e174d1ec0d1902c40289411ec0e) |
|
||||||
|
| 8 | [@zouyida2002](https://github.com/zouyida2002) | 2025/2/7 | [4495fc6](https://github.com/vllm-project/vllm-ascend/commit/4495fc68389e3fb1ef14534c202948931e38446b) |
|
||||||
|
| 7 | [@hw_whx](https://github.com/hw_whx) | 2025/2/7 | [7d16772](https://github.com/vllm-project/vllm-ascend/commit/7d1677263bc6628ade33bb780455e0f6e5b9b27a) |
|
||||||
|
| 6 | [@MengqingCao](https://github.com/MengqingCao) | 2025/2/6 | [7d9ae22](https://github.com/vllm-project/vllm-ascend/commit/7d9ae22ecb6dc3ea4e720e5109cf46e1ae7da730) |
|
||||||
|
| 5 | [@Potabk](https://github.com/Potabk) | 2025/2/6 | [8cb5615](https://github.com/vllm-project/vllm-ascend/commit/8cb5615fb010b34c2f4f89e03e6257bfee851f86) |
|
||||||
|
| 4 | [@wangxiyuan](https://github.com/wangxiyuan) | 2025/2/6 | [a48b9ad](https://github.com/vllm-project/vllm-ascend/commit/a48b9addefd292af523644411d4ff4142dd4bc66) |
|
||||||
|
| 3 | [@shen-shanshan](https://github.com/shen-shanshan) | 2025/2/6 | [bfccf73](https://github.com/vllm-project/vllm-ascend/commit/bfccf739e2fe121b54d9b198c2ec205a9379190e) |
|
||||||
|
| 2 | [@Yikun](https://github.com/Yikun) | 2025/2/5 | [d5e7756](https://github.com/vllm-project/vllm-ascend/commit/d5e7756028bd5884ade96b654555c375770a2f64) |
|
||||||
|
| 1 | [@simon-mo](https://github.com/simon-mo) | 2025/1/29 | [eb28342](https://github.com/vllm-project/vllm-ascend/commit/eb283428ddc17207b6866118f9bc15454b5b8801) |
|
||||||
48
docs/source/community/governance.md
Normal file
48
docs/source/community/governance.md
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
# Governance
|
||||||
|
|
||||||
|
## Mission
|
||||||
|
As a vital component of vLLM, the vLLM Ascend project is dedicated to providing an easy, fast, and cheap LLM Serving for Everyone on Ascend NPU, and to actively contribute to the enrichment of vLLM.
|
||||||
|
|
||||||
|
## Principles
|
||||||
|
vLLM Ascend follows the vLLM community's code of conduct: [vLLM - CODE OF CONDUCT](https://github.com/vllm-project/vllm/blob/main/CODE_OF_CONDUCT.md)
|
||||||
|
|
||||||
|
## Governance - Mechanics
|
||||||
|
vLLM Ascend is an open-source project under the vLLM community, where the authority to appoint roles is ultimately determined by the vLLM community. It adopts a hierarchical technical governance structure.
|
||||||
|
|
||||||
|
- Contributor:
|
||||||
|
|
||||||
|
**Responsibility:** Help with new contributor onboarding, handle and respond to community questions, review RFCs and code
|
||||||
|
|
||||||
|
**Requirements:** Complete at least 1 contribution. Contributor is someone who consistently and actively participates in a project, included but not limited to issue/review/commits/community involvement.
|
||||||
|
|
||||||
|
Contributors will be granted `Triage` permissions (`Can read and clone this repository. Can also manage issues and pull requests`) on the [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) GitHub repo to help community developers collaborate more efficiently.
|
||||||
|
|
||||||
|
- Maintainer:
|
||||||
|
|
||||||
|
**Responsibility:** Develop the project's vision and mission. Maintainers are responsible for driving the technical direction of the entire project and ensuring its overall success, possessing code merge permissions. They formulate the roadmap, review contributions from community members, continuously contribute code, and actively engage in community activities (such as regular meetings/events).
|
||||||
|
|
||||||
|
**Requirements:** Deep understanding of vLLM and vLLM Ascend codebases, with a commitment to sustained code contributions. Competency in design/development/PR review workflows.
|
||||||
|
- **Review Quality:** Actively participate in community code reviews, ensuring high-quality code integration.
|
||||||
|
- **Quality Contribution:** Successfully develop and deliver at least one major feature while maintaining consistent high-quality contributions.
|
||||||
|
- **Community Involvement:** Actively address issues, respond to forum inquiries, participate in discussions, and engage in community-driven tasks.
|
||||||
|
|
||||||
|
Requires approval from existing Maintainers. The vLLM community has the final decision-making authority.
|
||||||
|
|
||||||
|
Maintainers will be granted write permissions (`Can read, clone, and push to this repository. Can also manage issues and pull requests`) on the [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) GitHub repo.
|
||||||
|
|
||||||
|
## Nominating and Removing Maintainers
|
||||||
|
|
||||||
|
### The Principles
|
||||||
|
|
||||||
|
- Membership in vLLM Ascend is given to individuals on merit basis after they demonstrated strong expertise of the vLLM / vLLM Ascend through contributions, reviews and discussions.
|
||||||
|
|
||||||
|
- For membership in the maintainer group the individual has to demonstrate strong and continued alignment with the overall vLLM / vLLM Ascend principles.
|
||||||
|
|
||||||
|
- Light criteria of moving module maintenance to ‘emeritus’ status if they don’t actively participate over long periods of time.
|
||||||
|
|
||||||
|
- The membership is for an individual, not a company.
|
||||||
|
|
||||||
|
### Nomination and Removal
|
||||||
|
|
||||||
|
- Nomination: Anyone can nominate someone to become a maintainer (include self-nominate). All existing maintainers are responsible for evaluating the nomination. The nominator should provide nominee's info around the strength of the candidate to be a maintainer, include but not limited to review quality, quality contribution, community involvement.
|
||||||
|
- Removal: Anyone can nominate a person to be removed from maintainer position (include self-nominate). All existing maintainers are responsible for evaluating the nomination. The nominator should provide nominee's info, include but not limited to lack of activity, conflict with the overall direction and other information that makes them unfit to be a maintainer.
|
||||||
19
docs/source/community/user_stories/index.md
Normal file
19
docs/source/community/user_stories/index.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# User Stories
|
||||||
|
|
||||||
|
Read case studies on how users and developers solve real, everyday problems with vLLM Ascend.
|
||||||
|
|
||||||
|
- [LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform for training and fine-tuning large language models. It supports vLLM Ascend to speed up inference since [LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gaining a 2x performance enhancement in inference.
|
||||||
|
|
||||||
|
- [Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge library designed for post-training foundation models using advanced techniques like SFT, PPO and DPO, it uses vLLM Ascend since [v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to support RLHF on Ascend NPU.
|
||||||
|
|
||||||
|
- [MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference engine acceleration plug-in library developed by Huawei on Ascend hardware, which includes self-developed large language model optimization algorithms and optimizations related to the inference engine framework. It supports vLLM Ascend since [2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-turbo-0001.html).
|
||||||
|
|
||||||
|
- [GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU cluster manager for running AI models. It supports vLLM Ascend since [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more GPUStack performance evaluation info on [link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew).
|
||||||
|
|
||||||
|
- [verl](https://github.com/volcengine/verl) is a flexible, efficient and production-ready RL training library for large language models (LLMs), uses vLLM Ascend since [v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more info on [verl x Ascend Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html).
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: More details
|
||||||
|
:maxdepth: 1
|
||||||
|
llamafactory
|
||||||
|
:::
|
||||||
19
docs/source/community/user_stories/llamafactory.md
Normal file
19
docs/source/community/user_stories/llamafactory.md
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# LLaMA-Factory
|
||||||
|
|
||||||
|
**About / Introduction**
|
||||||
|
|
||||||
|
[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use and efficient platform for training and fine-tuning large language models. With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally without writing any code.
|
||||||
|
|
||||||
|
LLaMA-Factory users need to evaluate and run inference on the model after fine-tuning it.
|
||||||
|
|
||||||
|
**The Business Challenge**
|
||||||
|
|
||||||
|
LLaMA-Factory used transformers to perform inference on Ascend NPU, but the speed was slow.
|
||||||
|
|
||||||
|
**Solving Challenges and Benefits with vLLM Ascend**
|
||||||
|
|
||||||
|
With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the performance of LLaMA-Factory in the model inference stage has been significantly improved. According to the test results, the inference speed of LLaMA-Factory has been increased to 2x compared to the transformers version.
|
||||||
|
|
||||||
|
**Learn more**
|
||||||
|
|
||||||
|
See more about LLaMA-Factory and how it uses vLLM Ascend for inference on the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html).
|
||||||
131
docs/source/community/versioning_policy.md
Normal file
131
docs/source/community/versioning_policy.md
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
# Versioning policy
|
||||||
|
|
||||||
|
Starting with vLLM 0.7.x, the vLLM Ascend Plugin ([vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend)) project follows the [PEP 440](https://peps.python.org/pep-0440/) to publish matching with vLLM ([vllm-project/vllm](https://github.com/vllm-project/vllm)).
|
||||||
|
|
||||||
|
## vLLM Ascend Plugin versions
|
||||||
|
|
||||||
|
Each vLLM Ascend release will be versioned: `v[major].[minor].[micro][rcN][.postN]` (such as
|
||||||
|
`v0.7.3rc1`, `v0.7.3`, `v0.7.3.post1`)
|
||||||
|
|
||||||
|
- **Final releases**: will typically be released every **3 months**, will take the vLLM upstream release plan and Ascend software product release plan into comprehensive consideration.
|
||||||
|
- **Pre releases**: will typically be released **on demand**, ending with rcN, represents the Nth release candidate version, to support early testing by our users prior to a final release.
|
||||||
|
- **Post releases**: will typically be released **on demand** to address minor errors in a final release. It's different from [PEP-440 post release note](https://peps.python.org/pep-0440/#post-releases) suggestion, it will contain actual bug fixes considering that the final release version should be matched strictly with the vLLM final release version (`v[major].[minor].[micro]`). The post version has to be published as a patch version of the final release.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
- `v0.7.x`: it's the first final release to match the vLLM `v0.7.x` version.
|
||||||
|
- `v0.7.3rc1`: will be the first pre version of vLLM Ascend.
|
||||||
|
- `v0.7.3.post1`: will be the post release if the `v0.7.3` release has some minor errors.
|
||||||
|
|
||||||
|
## Release Compatibility Matrix
|
||||||
|
|
||||||
|
Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
|
||||||
|
|
||||||
|
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
|
||||||
|
|-------------|--------------|------------------|-------------|--------------------|--------------|
|
||||||
|
| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||||
|
| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||||
|
| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | |
|
||||||
|
| v0.9.1 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | |
|
||||||
|
| v0.9.1rc3 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1 | |
|
||||||
|
| v0.9.1rc2 | v0.9.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.5.1 / 2.5.1.post1| |
|
||||||
|
| v0.9.1rc1 | v0.9.1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250528 | |
|
||||||
|
| v0.9.0rc2 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
|
||||||
|
| v0.9.0rc1 | v0.9.0 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
|
||||||
|
| v0.8.5rc1 | v0.8.5.post1 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | |
|
||||||
|
| v0.8.4rc2 | v0.8.4 | >= 3.9, < 3.12 | 8.0.0 | 2.5.1 / 2.5.1 | |
|
||||||
|
| v0.7.3.post1| v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | 2.0rc1 |
|
||||||
|
| v0.7.3 | v0.7.3 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1 | 2.0rc1 |
|
||||||
|
|
||||||
|
## Release cadence
|
||||||
|
|
||||||
|
### release window
|
||||||
|
|
||||||
|
| Date | Event |
|
||||||
|
|------------|-------------------------------------------|
|
||||||
|
| 2025.09.04 | Release candidates, v0.10.1rc1 |
|
||||||
|
| 2025.09.03 | v0.9.1 Final release |
|
||||||
|
| 2025.08.22 | Release candidates, v0.9.1rc3 |
|
||||||
|
| 2025.08.07 | Release candidates, v0.10.0rc1 |
|
||||||
|
| 2025.08.04 | Release candidates, v0.9.1rc2 |
|
||||||
|
| 2025.07.11 | Release candidates, v0.9.2rc1 |
|
||||||
|
| 2025.06.22 | Release candidates, v0.9.1rc1 |
|
||||||
|
| 2025.06.10 | Release candidates, v0.9.0rc2 |
|
||||||
|
| 2025.06.09 | Release candidates, v0.9.0rc1 |
|
||||||
|
| 2025.05.29 | v0.7.x post release, v0.7.3.post1 |
|
||||||
|
| 2025.05.08 | v0.7.x Final release, v0.7.3 |
|
||||||
|
| 2025.05.06 | Release candidates, v0.8.5rc1 |
|
||||||
|
| 2025.04.28 | Release candidates, v0.8.4rc2 |
|
||||||
|
| 2025.04.18 | Release candidates, v0.8.4rc1 |
|
||||||
|
| 2025.03.28 | Release candidates, v0.7.3rc2 |
|
||||||
|
| 2025.03.14 | Release candidates, v0.7.3rc1 |
|
||||||
|
| 2025.02.19 | Release candidates, v0.7.1rc1 |
|
||||||
|
|
||||||
|
## Branch policy
|
||||||
|
|
||||||
|
vLLM Ascend has main branch and dev branch.
|
||||||
|
|
||||||
|
- **main**: main branch, corresponds to the vLLM main branch and the latest 1 or 2 release versions. It is continuously monitored for quality through Ascend CI.
|
||||||
|
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
|
||||||
|
|
||||||
|
Usually, a commit should be ONLY first merged in the main branch, and then backported to the dev branch to reduce maintenance costs as much as possible.
|
||||||
|
|
||||||
|
### Maintenance branch and EOL:
|
||||||
|
The branch status will be in one of the following states:
|
||||||
|
|
||||||
|
| Branch | Time frame | Summary |
|
||||||
|
|-------------------|----------------------------------|----------------------------------------------------------------------|
|
||||||
|
| Maintained | Approximately 2-3 minor versions | All bugfixes are appropriate. Releases produced, CI commitment. |
|
||||||
|
| Unmaintained | Community interest driven | All bugfixes are appropriate. No Releases produced, No CI commitment |
|
||||||
|
| End of Life (EOL) | N/A | Branch no longer accepting changes |
|
||||||
|
|
||||||
|
### Branch state
|
||||||
|
|
||||||
|
Note that vLLM Ascend will only be released for a certain vLLM release version rather than all versions. Hence, You might see only part of versions have dev branches (such as only `0.7.1-dev` / `0.7.3-dev` but no `0.7.2-dev`), this is as expected.
|
||||||
|
|
||||||
|
Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM Ascend version branch and support its latest version (for example, we plan to support version 0.7.3) as following shown:
|
||||||
|
|
||||||
|
| Branch | Status | Note |
|
||||||
|
|------------|--------------|--------------------------------------|
|
||||||
|
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.9.2 branch |
|
||||||
|
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
|
||||||
|
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
|
||||||
|
| v0.7.1-dev | Unmaintained | Replaced by v0.7.3-dev |
|
||||||
|
|
||||||
|
### Feature branches
|
||||||
|
|
||||||
|
| Branch | Status | RFC link | Merge plan | Mentor |
|
||||||
|
|------------|--------------|---------------------------------------|------------|--------|
|
||||||
|
|rfc/long_seq_optimization|Maintained|https://github.com/vllm-project/vllm/issues/22693|930|wangxiyuan|
|
||||||
|
- Branch: The feature branch should be created with a prefix `rfc/` followed by the feature name, such as `rfc/feature-name`.
|
||||||
|
- Status: The status of the feature branch is `Maintained` until it is merged into the main branch or deleted.
|
||||||
|
- RFC link: The feature branch should be created with a corresponding RFC issue. The creation of a feature branch requires an RFC and approval from at least two maintainers.
|
||||||
|
- Merge plan: The final goal of a feature branch is to merge it into the main branch. If it exceeds 3 months, the mentor maintainer should evaluate whether to delete the branch.
|
||||||
|
- Mentor: The mentor should be a vLLM Ascend maintainer who is responsible for the feature branch.
|
||||||
|
|
||||||
|
### Backward compatibility
|
||||||
|
|
||||||
|
For the main branch, vLLM Ascend should work with the vLLM main branch and the latest 1 or 2 release versions. So to ensure backward compatibility, we will do the following:
|
||||||
|
- Both main branch and target vLLM release is tested by Ascend E2E CI. For example, currently, vLLM main branch and vLLM 0.8.4 are tested now.
|
||||||
|
- For code changes, we will make sure that the changes are compatible with the latest 1 or 2 vLLM release versions as well. In this case, vLLM Ascend introduced a version check mechanism inside the code. It'll check the version of the installed vLLM package first to decide which code logic to use. If users hit the `InvalidVersion` error, it sometimes means that they have installed a dev/editable version of the vLLM package. In this case, we provide the env variable `VLLM_VERSION` to let users specify the version of the vLLM package to use.
|
||||||
|
- For documentation changes, we will make sure that the changes are compatible with the latest 1 or 2 vLLM release version as well. Note should be added if there are any breaking changes.
|
||||||
|
|
||||||
|
## Document Branch Policy
|
||||||
|
To reduce maintenance costs, **all branch documentation content should remain consistent, and version differences can be controlled via variables in [docs/source/conf.py](https://github.com/vllm-project/vllm-ascend/blob/main/docs/source/conf.py)**. While this is not a simple task, it is a principle we should strive to follow.
|
||||||
|
|
||||||
|
| Version | Purpose | Code Branch |
|
||||||
|
|-----|-----|---------|
|
||||||
|
| latest | Doc for the latest dev branch | vX.Y.Z-dev (Will be `main` after the first final release) |
|
||||||
|
| version | Doc for historical released versions | Git tags, like vX.Y.Z[rcN] |
|
||||||
|
| stable(not yet released) | Doc for latest final release branch | Will be `vX.Y.Z-dev` after the first official release |
|
||||||
|
|
||||||
|
As shown above:
|
||||||
|
|
||||||
|
- `latest` documentation: Matches the current maintenance branch `vX.Y.Z-dev` (Will be `main` after the first final release). Continuously updated to ensure usability for the latest release.
|
||||||
|
- `version` documentation: Corresponds to specific released versions (e.g., `v0.7.3`, `v0.7.3rc1`). No further updates after release.
|
||||||
|
- `stable` documentation (**not yet released**): Official release documentation. Updates are allowed in real-time after release, typically based on vX.Y.Z-dev. Once stable documentation is available, non-stable versions should display a header warning: `You are viewing the latest developer preview docs. Click here to view docs for the latest stable release.`.
|
||||||
|
|
||||||
|
## Software Dependency Management
|
||||||
|
- `torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable version to [PyPi](https://pypi.org/project/torch-npu)
|
||||||
|
every 3 months, a development version (aka the POC version) every month, and a nightly version every day.
|
||||||
|
The PyPI stable version **CAN** be used in a vLLM Ascend final version, the monthly dev version can **ONLY** be used in a
|
||||||
|
vLLM Ascend RC version for rapid iteration, and the nightly version **CANNOT** be used in any vLLM Ascend version or branch.
|
||||||
142
docs/source/conf.py
Normal file
142
docs/source/conf.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
#
|
||||||
|
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||||
|
# Copyright 2023 The vLLM team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# This file is a part of the vllm-ascend project.
|
||||||
|
# Adapted from vllm-project/vllm/docs/source/conf.py
|
||||||
|
#
|
||||||
|
|
||||||
|
# -- Path setup --------------------------------------------------------------
|
||||||
|
|
||||||
|
# If extensions (or modules to document with autodoc) are in another directory,
|
||||||
|
# add these directories to sys.path here. If the directory is relative to the
|
||||||
|
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||||
|
#
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
# import sys
|
||||||
|
# sys.path.insert(0, os.path.abspath('.'))
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
|
project = 'vllm-ascend'
|
||||||
|
copyright = '2025, vllm-ascend team'
|
||||||
|
author = 'the vllm-ascend team'
|
||||||
|
|
||||||
|
# The full version, including alpha/beta/rc tags
|
||||||
|
release = ''
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
|
||||||
|
# Add any Sphinx extension module names here, as strings. They can be
|
||||||
|
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||||
|
# ones.
|
||||||
|
|
||||||
|
# Copy from https://github.com/vllm-project/vllm/blob/main/docs/source/conf.py
|
||||||
|
extensions = [
|
||||||
|
"sphinx.ext.napoleon",
|
||||||
|
"sphinx.ext.intersphinx",
|
||||||
|
"sphinx_copybutton",
|
||||||
|
"sphinx.ext.autodoc",
|
||||||
|
"sphinx.ext.autosummary",
|
||||||
|
"myst_parser",
|
||||||
|
"sphinxarg.ext",
|
||||||
|
"sphinx_design",
|
||||||
|
"sphinx_togglebutton",
|
||||||
|
"sphinx_substitution_extensions",
|
||||||
|
]
|
||||||
|
|
||||||
|
myst_enable_extensions = ["colon_fence", "substitution"]
|
||||||
|
|
||||||
|
# Change this when cut down release
|
||||||
|
myst_substitutions = {
|
||||||
|
# the branch of vllm, used in vllm clone
|
||||||
|
# - main branch: 'main'
|
||||||
|
# - vX.Y.Z branch: 'vX.Y.Z'
|
||||||
|
'vllm_version': 'v0.10.1.1',
|
||||||
|
# the branch of vllm-ascend, used in vllm-ascend clone and image tag
|
||||||
|
# - main branch: 'main'
|
||||||
|
# - vX.Y.Z branch: latest vllm-ascend release tag
|
||||||
|
'vllm_ascend_version': 'v0.10.1rc1',
|
||||||
|
# the newest release version of vllm-ascend and matched vLLM, used in pip install.
|
||||||
|
# This value should be updated when cut down release.
|
||||||
|
'pip_vllm_ascend_version': "0.10.1rc1",
|
||||||
|
'pip_vllm_version': "0.10.1.1",
|
||||||
|
# CANN image tag
|
||||||
|
'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
|
||||||
|
# vllm version in ci
|
||||||
|
'ci_vllm_version': 'v0.10.1.1',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
|
templates_path = ['_templates']
|
||||||
|
|
||||||
|
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||||
|
# for a list of supported languages.
|
||||||
|
#
|
||||||
|
# This is also used if you do content translation via gettext catalogs.
|
||||||
|
# Usually you set "language" from the command line for these cases.
|
||||||
|
locale_dirs = ['locale/']
|
||||||
|
gettext_compact = False
|
||||||
|
# List of patterns, relative to source directory, that match files and
|
||||||
|
# directories to ignore when looking for source files.
|
||||||
|
# This pattern also affects html_static_path and html_extra_path.
|
||||||
|
exclude_patterns = [
|
||||||
|
'_build',
|
||||||
|
'Thumbs.db',
|
||||||
|
'.DS_Store',
|
||||||
|
'.venv',
|
||||||
|
'README.md',
|
||||||
|
'user_guide/release.template.md',
|
||||||
|
# TODO(yikun): Remove this after zh supported
|
||||||
|
'**/*.zh.md'
|
||||||
|
]
|
||||||
|
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
|
||||||
|
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||||
|
# a list of builtin themes.
|
||||||
|
#
|
||||||
|
html_title = project
|
||||||
|
html_theme = 'sphinx_book_theme'
|
||||||
|
html_logo = 'logos/vllm-ascend-logo-text-light.png'
|
||||||
|
html_theme_options = {
|
||||||
|
'path_to_docs': 'docs/source',
|
||||||
|
'repository_url': 'https://github.com/vllm-project/vllm-ascend',
|
||||||
|
'use_repository_button': True,
|
||||||
|
'use_edit_page_button': True,
|
||||||
|
}
|
||||||
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
|
# html_static_path = ['_static']
|
||||||
|
|
||||||
|
READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
|
||||||
|
if READTHEDOCS_VERSION_TYPE == "tag":
|
||||||
|
# remove the warning banner if the version is a tagged release
|
||||||
|
header_file = os.path.join(os.path.dirname(__file__),
|
||||||
|
"_templates/sections/header.html")
|
||||||
|
# The file might be removed already if the build is triggered multiple times
|
||||||
|
# (readthedocs build both HTML and PDF versions separately)
|
||||||
|
if os.path.exists(header_file):
|
||||||
|
os.remove(header_file)
|
||||||
|
|
||||||
|
|
||||||
|
def setup(app):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print(json.dumps(myst_substitutions))
|
||||||
111
docs/source/developer_guide/contribution/index.md
Normal file
111
docs/source/developer_guide/contribution/index.md
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
# Contributing
|
||||||
|
|
||||||
|
## Building and testing
|
||||||
|
It's recommended to set up a local development environment to build and test
|
||||||
|
before you submit a PR.
|
||||||
|
|
||||||
|
### Setup development environment
|
||||||
|
|
||||||
|
Theoretically, the vllm-ascend build is only supported on Linux because
|
||||||
|
`vllm-ascend` dependency `torch_npu` only supports Linux.
|
||||||
|
|
||||||
|
But you can still set up dev env on Linux/Windows/macOS for linting and basic
|
||||||
|
tests with the following commands:
|
||||||
|
|
||||||
|
#### Run lint locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Choose a base dir (~/vllm-project/) and set up venv
|
||||||
|
cd ~/vllm-project/
|
||||||
|
python3 -m venv .venv
|
||||||
|
source ./.venv/bin/activate
|
||||||
|
|
||||||
|
# Clone vllm-ascend and install
|
||||||
|
git clone https://github.com/vllm-project/vllm-ascend.git
|
||||||
|
cd vllm-ascend
|
||||||
|
|
||||||
|
# Install lint requirement and enable pre-commit hook
|
||||||
|
pip install -r requirements-lint.txt
|
||||||
|
|
||||||
|
# Run lint (You need install pre-commits deps via proxy network at first time)
|
||||||
|
bash format.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Run CI locally
|
||||||
|
|
||||||
|
After complete "Run lint" setup, you can run CI locally:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
cd ~/vllm-project/
|
||||||
|
|
||||||
|
# Run CI need vLLM installed
|
||||||
|
git clone --branch |vllm_version| https://github.com/vllm-project/vllm.git
|
||||||
|
cd vllm
|
||||||
|
pip install -r requirements/build.txt
|
||||||
|
VLLM_TARGET_DEVICE="empty" pip install .
|
||||||
|
cd ..
|
||||||
|
|
||||||
|
# Install requirements
|
||||||
|
cd vllm-ascend
|
||||||
|
# For Linux:
|
||||||
|
pip install -r requirements-dev.txt
|
||||||
|
# For non Linux:
|
||||||
|
cat requirements-dev.txt | grep -Ev '^#|^--|^$|^-r' | while read PACKAGE; do pip install "$PACKAGE"; done
|
||||||
|
cat requirements.txt | grep -Ev '^#|^--|^$|^-r' | while read PACKAGE; do pip install "$PACKAGE"; done
|
||||||
|
|
||||||
|
# Run ci:
|
||||||
|
bash format.sh ci
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Submit the commit
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Commit changed files using `-s`
|
||||||
|
git commit -sm "your commit info"
|
||||||
|
```
|
||||||
|
|
||||||
|
🎉 Congratulations! You have completed the development environment setup.
|
||||||
|
|
||||||
|
### Test locally
|
||||||
|
|
||||||
|
You can refer to [Testing](./testing.md) doc to help you setup testing environment and running tests locally.
|
||||||
|
|
||||||
|
## DCO and Signed-off-by
|
||||||
|
|
||||||
|
When contributing changes to this project, you must agree to the DCO. Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO.
|
||||||
|
|
||||||
|
Using `-s` with `git commit` will automatically add this header.
|
||||||
|
|
||||||
|
## PR Title and Classification
|
||||||
|
|
||||||
|
Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:
|
||||||
|
|
||||||
|
- `[Attention]` for new features or optimization in attention.
|
||||||
|
- `[Communicator]` for new features or optimization in communicators.
|
||||||
|
- `[ModelRunner]` for new features or optimization in model runner.
|
||||||
|
- `[Platform]` for new features or optimization in platform.
|
||||||
|
- `[Worker]` for new features or optimization in worker.
|
||||||
|
- `[Core]` for new features or optimization in the core vllm-ascend logic (such as platform, attention, communicators, model runner)
|
||||||
|
- `[Kernel]` changes affecting compute kernels and ops.
|
||||||
|
- `[Bugfix]` for bug fixes.
|
||||||
|
- `[Doc]` for documentation fixes and improvements.
|
||||||
|
- `[Test]` for tests (such as unit tests).
|
||||||
|
- `[CI]` for build or continuous integration improvements.
|
||||||
|
- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly.
|
||||||
|
|
||||||
|
:::{note}
|
||||||
|
If the PR spans more than one category, please include all relevant prefixes.
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Others
|
||||||
|
|
||||||
|
You may find more information about contributing to vLLM Ascend backend plugin on [<u>docs.vllm.ai</u>](https://docs.vllm.ai/en/latest/contributing/overview.html).
|
||||||
|
If you find any problem when contributing, you can feel free to submit a PR to improve the doc to help other developers.
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Index
|
||||||
|
:maxdepth: 1
|
||||||
|
testing
|
||||||
|
:::
|
||||||
285
docs/source/developer_guide/contribution/testing.md
Normal file
285
docs/source/developer_guide/contribution/testing.md
Normal file
@@ -0,0 +1,285 @@
|
|||||||
|
# Testing
|
||||||
|
|
||||||
|
This section explains how to write e2e tests and unit tests to verify the implementation of your feature.
|
||||||
|
|
||||||
|
## Setup test environment
|
||||||
|
|
||||||
|
The fastest way to setup test environment is to use the main branch container image:
|
||||||
|
|
||||||
|
:::::{tab-set}
|
||||||
|
:sync-group: e2e
|
||||||
|
|
||||||
|
::::{tab-item} Local (CPU)
|
||||||
|
:selected:
|
||||||
|
:sync: cpu
|
||||||
|
|
||||||
|
You can run the unit tests on CPU with the following steps:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
cd ~/vllm-project/
|
||||||
|
# ls
|
||||||
|
# vllm vllm-ascend
|
||||||
|
|
||||||
|
# Use mirror to speedup download
|
||||||
|
# docker pull quay.nju.edu.cn/ascend/cann:|cann_image_tag|
|
||||||
|
export IMAGE=quay.io/ascend/cann:|cann_image_tag|
|
||||||
|
docker run --rm --name vllm-ascend-ut \
|
||||||
|
-v $(pwd):/vllm-project \
|
||||||
|
-v ~/.cache:/root/.cache \
|
||||||
|
-ti $IMAGE bash
|
||||||
|
|
||||||
|
# (Optional) Configure mirror to speedup download
|
||||||
|
sed -i 's|ports.ubuntu.com|mirrors.huaweicloud.com|g' /etc/apt/sources.list
|
||||||
|
pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple/
|
||||||
|
|
||||||
|
# For torch-npu dev version or x86 machine
|
||||||
|
export PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi"
|
||||||
|
|
||||||
|
apt-get update -y
|
||||||
|
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
|
||||||
|
|
||||||
|
# Install vllm
|
||||||
|
cd /vllm-project/vllm
|
||||||
|
VLLM_TARGET_DEVICE=empty python3 -m pip -v install .
|
||||||
|
|
||||||
|
# Install vllm-ascend
|
||||||
|
cd /vllm-project/vllm-ascend
|
||||||
|
# [IMPORTANT] Import LD_LIBRARY_PATH to enumerate the CANN environment under CPU
|
||||||
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -m)-linux/devlib
|
||||||
|
python3 -m pip install -r requirements-dev.txt
|
||||||
|
python3 -m pip install -v .
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Single card
|
||||||
|
:sync: single
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci0
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:main
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-it $IMAGE bash
|
||||||
|
```
|
||||||
|
|
||||||
|
After starting the container, you should install the required packages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Prepare
|
||||||
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
|
|
||||||
|
# Install required packages
|
||||||
|
pip install -r requirements-dev.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Multi cards
|
||||||
|
:sync: multi
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:main
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device /dev/davinci0 \
|
||||||
|
--device /dev/davinci1 \
|
||||||
|
--device /dev/davinci2 \
|
||||||
|
--device /dev/davinci3 \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-it $IMAGE bash
|
||||||
|
```
|
||||||
|
|
||||||
|
After starting the container, you should install the required packages:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /vllm-workspace/vllm-ascend/
|
||||||
|
|
||||||
|
# Prepare
|
||||||
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
|
|
||||||
|
# Install required packages
|
||||||
|
pip install -r requirements-dev.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
:::::
|
||||||
|
|
||||||
|
## Running tests
|
||||||
|
|
||||||
|
### Unit test
|
||||||
|
|
||||||
|
There are several principles to follow when writing unit tests:
|
||||||
|
|
||||||
|
- The test file path should be consistent with source file and start with `test_` prefix, such as: `vllm_ascend/worker/worker_v1.py` --> `tests/ut/worker/test_worker_v1.py`
|
||||||
|
- The vLLM Ascend test are using unittest framework, see [here](https://docs.python.org/3/library/unittest.html#module-unittest) to understand how to write unit tests.
|
||||||
|
- All unit tests can be run on CPU, so you must mock the device-related function to host.
|
||||||
|
- Example: [tests/ut/test_ascend_config.py](https://github.com/vllm-project/vllm-ascend/blob/main/tests/ut/test_ascend_config.py).
|
||||||
|
- You can run the unit tests using `pytest`:
|
||||||
|
|
||||||
|
:::::{tab-set}
|
||||||
|
:sync-group: e2e
|
||||||
|
|
||||||
|
::::{tab-item} Local (CPU)
|
||||||
|
:selected:
|
||||||
|
:sync: cpu
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run unit tests
|
||||||
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/$(uname -m)-linux/devlib
|
||||||
|
TORCH_DEVICE_BACKEND_AUTOLOAD=0 pytest -sv tests/ut
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Single card
|
||||||
|
:sync: single
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /vllm-workspace/vllm-ascend/
|
||||||
|
# Run all the single card tests
|
||||||
|
pytest -sv tests/ut
|
||||||
|
|
||||||
|
# Run single test
|
||||||
|
pytest -sv tests/ut/test_ascend_config.py
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Multi cards test
|
||||||
|
:sync: multi
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /vllm-workspace/vllm-ascend/
|
||||||
|
# Run all the single card tests
|
||||||
|
pytest -sv tests/ut
|
||||||
|
|
||||||
|
# Run single test
|
||||||
|
pytest -sv tests/ut/test_ascend_config.py
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
:::::
|
||||||
|
|
||||||
|
### E2E test
|
||||||
|
|
||||||
|
Although vllm-ascend CI provides [e2e tests](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml) on Ascend CI, you can also run them
|
||||||
|
locally.
|
||||||
|
|
||||||
|
:::::{tab-set}
|
||||||
|
:sync-group: e2e
|
||||||
|
|
||||||
|
::::{tab-item} Local (CPU)
|
||||||
|
:sync: cpu
|
||||||
|
|
||||||
|
You can't run e2e test on CPU.
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Single card
|
||||||
|
:selected:
|
||||||
|
:sync: single
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /vllm-workspace/vllm-ascend/
|
||||||
|
# Run all the single card tests
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/
|
||||||
|
|
||||||
|
# Run a certain test script
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py
|
||||||
|
|
||||||
|
# Run a certain case in test script
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/singlecard/test_offline_inference.py::test_models
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Multi cards test
|
||||||
|
:sync: multi
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /vllm-workspace/vllm-ascend/
|
||||||
|
# Run all the multi-card tests
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/
|
||||||
|
|
||||||
|
# Run a certain test script
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/test_dynamic_npugraph_batchsize.py
|
||||||
|
|
||||||
|
# Run a certain case in test script
|
||||||
|
VLLM_USE_MODELSCOPE=true pytest -sv tests/e2e/multicard/test_offline_inference.py::test_models
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
:::::
|
||||||
|
|
||||||
|
This will reproduce e2e test: [vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml).
|
||||||
|
|
||||||
|
#### E2E test example:
|
||||||
|
|
||||||
|
- Offline test example: [`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)
|
||||||
|
- Online test examples: [`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)
|
||||||
|
- Correctness test example: [`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)
|
||||||
|
- Reduced Layer model test example: [test_torchair_graph_mode.py - DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)
|
||||||
|
|
||||||
|
CI resources are limited, so you might need to reduce the number of layers of the model. Below is an example of how to generate a reduced-layer model:
|
||||||
|
1. Fork the original model repo in modelscope, we need all the files in the repo except for weights.
|
||||||
|
2. Set `num_hidden_layers` to the expected number of layers, e.g., `{"num_hidden_layers": 2,}`
|
||||||
|
3. Copy the following python script as `generate_random_weight.py`. Set the relevant parameters `MODEL_LOCAL_PATH`, `DIST_DTYPE` and `DIST_MODEL_PATH` as needed:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoConfig
|
||||||
|
from modeling_deepseek import DeepseekV3ForCausalLM
|
||||||
|
from modelscope import snapshot_download
|
||||||
|
|
||||||
|
MODEL_LOCAL_PATH = "~/.cache/modelscope/models/vllm-ascend/DeepSeek-V3-Pruning"
|
||||||
|
DIST_DTYPE = torch.bfloat16
|
||||||
|
DIST_MODEL_PATH = "./random_deepseek_v3_with_2_hidden_layer"
|
||||||
|
|
||||||
|
config = AutoConfig.from_pretrained(MODEL_LOCAL_PATH, trust_remote_code=True)
|
||||||
|
model = DeepseekV3ForCausalLM(config)
|
||||||
|
model = model.to(DIST_DTYPE)
|
||||||
|
model.save_pretrained(DIST_MODEL_PATH)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run doctest
|
||||||
|
|
||||||
|
vllm-ascend provides a `vllm-ascend/tests/e2e/run_doctests.sh` command to run all doctests in the doc files.
|
||||||
|
The doctest is a good way to make sure the docs are up to date and the examples are executable, you can run it locally as follows:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run doctest
|
||||||
|
/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
This will reproduce the same environment as the CI: [vllm_ascend_doctest.yaml](https://github.com/vllm-project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml).
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
# Accuracy Report
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Accuracy Report
|
||||||
|
:maxdepth: 1
|
||||||
|
:::
|
||||||
10
docs/source/developer_guide/evaluation/index.md
Normal file
10
docs/source/developer_guide/evaluation/index.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Accuracy
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Accuracy
|
||||||
|
:maxdepth: 1
|
||||||
|
using_evalscope
|
||||||
|
using_lm_eval
|
||||||
|
using_opencompass
|
||||||
|
accuracy_report/index
|
||||||
|
:::
|
||||||
175
docs/source/developer_guide/evaluation/using_evalscope.md
Normal file
175
docs/source/developer_guide/evaluation/using_evalscope.md
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
# Using EvalScope
|
||||||
|
|
||||||
|
This document will guide you through model inference stress testing and accuracy testing using [EvalScope](https://github.com/modelscope/evalscope).
|
||||||
|
|
||||||
|
## 1. Online serving
|
||||||
|
|
||||||
|
You can run docker container to start the vLLM server on a single NPU:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VLLM_USE_MODELSCOPE=True \
|
||||||
|
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
|
||||||
|
-it $IMAGE \
|
||||||
|
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
|
||||||
|
```
|
||||||
|
|
||||||
|
If your service starts successfully, you will see the info shown below:
|
||||||
|
|
||||||
|
```
|
||||||
|
INFO: Started server process [6873]
|
||||||
|
INFO: Waiting for application startup.
|
||||||
|
INFO: Application startup complete.
|
||||||
|
```
|
||||||
|
|
||||||
|
Once your server is started, you can query the model with input prompts in a new terminal:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"prompt": "The future of AI is",
|
||||||
|
"max_tokens": 7,
|
||||||
|
"temperature": 0
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2. Install EvalScope using pip
|
||||||
|
|
||||||
|
You can install EvalScope by using:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 -m venv .venv-evalscope
|
||||||
|
source .venv-evalscope/bin/activate
|
||||||
|
pip install gradio plotly evalscope
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. Run gsm8k accuracy test using EvalScope
|
||||||
|
|
||||||
|
You can use `evalscope eval` to run the gsm8k accuracy test:
|
||||||
|
|
||||||
|
```
|
||||||
|
evalscope eval \
|
||||||
|
--model Qwen/Qwen2.5-7B-Instruct \
|
||||||
|
--api-url http://localhost:8000/v1 \
|
||||||
|
--api-key EMPTY \
|
||||||
|
--eval-type service \
|
||||||
|
--datasets gsm8k \
|
||||||
|
--limit 10
|
||||||
|
```
|
||||||
|
|
||||||
|
After 1-2 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
+---------------------+-----------+-----------------+----------+-------+---------+---------+
|
||||||
|
| Model | Dataset | Metric | Subset | Num | Score | Cat.0 |
|
||||||
|
+=====================+===========+=================+==========+=======+=========+=========+
|
||||||
|
| Qwen2.5-7B-Instruct | gsm8k | AverageAccuracy | main | 10 | 0.8 | default |
|
||||||
|
+---------------------+-----------+-----------------+----------+-------+---------+---------+
|
||||||
|
```
|
||||||
|
|
||||||
|
See more detail in: [EvalScope doc - Model API Service Evaluation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-api-service-evaluation).
|
||||||
|
|
||||||
|
## 4. Run model inference stress testing using EvalScope
|
||||||
|
|
||||||
|
### Install EvalScope[perf] using pip
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install evalscope[perf] -U
|
||||||
|
```
|
||||||
|
|
||||||
|
### Basic usage
|
||||||
|
|
||||||
|
You can use `evalscope perf` to run a perf test:
|
||||||
|
|
||||||
|
```
|
||||||
|
evalscope perf \
|
||||||
|
--url "http://localhost:8000/v1/chat/completions" \
|
||||||
|
--parallel 5 \
|
||||||
|
--model Qwen/Qwen2.5-7B-Instruct \
|
||||||
|
--number 20 \
|
||||||
|
--api openai \
|
||||||
|
--dataset openqa \
|
||||||
|
--stream
|
||||||
|
```
|
||||||
|
|
||||||
|
### Output results
|
||||||
|
|
||||||
|
After 1-2 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
Benchmarking summary:
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Key | Value |
|
||||||
|
+===================================+===============================================================+
|
||||||
|
| Time taken for tests (s) | 38.3744 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Number of concurrency | 5 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Total requests | 20 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Succeed requests | 20 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Failed requests | 0 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Output token throughput (tok/s) | 132.6926 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Total token throughput (tok/s) | 158.8819 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Request throughput (req/s) | 0.5212 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average latency (s) | 8.3612 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average time to first token (s) | 0.1035 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average time per output token (s) | 0.0329 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average input tokens per request | 50.25 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average output tokens per request | 254.6 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average package latency (s) | 0.0324 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Average package per request | 254.6 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Expected number of requests | 20 |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
| Result DB path | outputs/20250423_002442/Qwen2.5-7B-Instruct/benchmark_data.db |
|
||||||
|
+-----------------------------------+---------------------------------------------------------------+
|
||||||
|
|
||||||
|
Percentile results:
|
||||||
|
+------------+----------+---------+-------------+--------------+---------------+----------------------+
|
||||||
|
| Percentile | TTFT (s) | ITL (s) | Latency (s) | Input tokens | Output tokens | Throughput(tokens/s) |
|
||||||
|
+------------+----------+---------+-------------+--------------+---------------+----------------------+
|
||||||
|
| 10% | 0.0962 | 0.031 | 4.4571 | 42 | 135 | 29.9767 |
|
||||||
|
| 25% | 0.0971 | 0.0318 | 6.3509 | 47 | 193 | 30.2157 |
|
||||||
|
| 50% | 0.0987 | 0.0321 | 9.3387 | 49 | 285 | 30.3969 |
|
||||||
|
| 66% | 0.1017 | 0.0324 | 9.8519 | 52 | 302 | 30.5182 |
|
||||||
|
| 75% | 0.107 | 0.0328 | 10.2391 | 55 | 313 | 30.6124 |
|
||||||
|
| 80% | 0.1221 | 0.0329 | 10.8257 | 58 | 330 | 30.6759 |
|
||||||
|
| 90% | 0.1245 | 0.0333 | 13.0472 | 62 | 404 | 30.9644 |
|
||||||
|
| 95% | 0.1247 | 0.0336 | 14.2936 | 66 | 432 | 31.6691 |
|
||||||
|
| 98% | 0.1247 | 0.0353 | 14.2936 | 66 | 432 | 31.6691 |
|
||||||
|
| 99% | 0.1247 | 0.0627 | 14.2936 | 66 | 432 | 31.6691 |
|
||||||
|
+------------+----------+---------+-------------+--------------+---------------+----------------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
See more detail in: [EvalScope doc - Model Inference Stress Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-usage).
|
||||||
300
docs/source/developer_guide/evaluation/using_lm_eval.md
Normal file
300
docs/source/developer_guide/evaluation/using_lm_eval.md
Normal file
@@ -0,0 +1,300 @@
|
|||||||
|
# Using lm-eval
|
||||||
|
This document will guide you through accuracy testing using [lm-eval][1].
|
||||||
|
|
||||||
|
## Online Server
|
||||||
|
### 1. start the vLLM server
|
||||||
|
You can run docker container to start the vLLM server on a single NPU:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VLLM_USE_MODELSCOPE=True \
|
||||||
|
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
|
||||||
|
-it $IMAGE \
|
||||||
|
/bin/bash
|
||||||
|
vllm serve Qwen/Qwen2.5-0.5B-Instruct --max_model_len 4096 &
|
||||||
|
```
|
||||||
|
|
||||||
|
The vLLM server has started successfully if you see logs like the ones below:
|
||||||
|
|
||||||
|
```
|
||||||
|
INFO: Started server process [9446]
|
||||||
|
INFO: Waiting for application startup.
|
||||||
|
INFO: Application startup complete.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Run gsm8k accuracy test using lm-eval
|
||||||
|
|
||||||
|
You can query results with input prompts:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"prompt": "'"<|im_start|>system\nYou are a professional accountant. Answer questions using accounting knowledge, output only the option letter (A/B/C/D).<|im_end|>\n"\
|
||||||
|
"<|im_start|>user\nQuestion: A company's balance sheet as of December 31, 2023 shows:\n"\
|
||||||
|
" Current assets: Cash and equivalents 5 million yuan, Accounts receivable 8 million yuan, Inventory 6 million yuan\n"\
|
||||||
|
" Non-current assets: Net fixed assets 12 million yuan\n"\
|
||||||
|
" Current liabilities: Short-term loans 4 million yuan, Accounts payable 3 million yuan\n"\
|
||||||
|
" Non-current liabilities: Long-term loans 9 million yuan\n"\
|
||||||
|
" Owner's equity: Paid-in capital 10 million yuan, Retained earnings ?\n"\
|
||||||
|
"Requirement: Calculate the company's Asset-Liability Ratio and Current Ratio (round to two decimal places).\n"\
|
||||||
|
"Options:\n"\
|
||||||
|
"A. Asset-Liability Ratio=58.33%, Current Ratio=1.90\n"\
|
||||||
|
"B. Asset-Liability Ratio=62.50%, Current Ratio=2.17\n"\
|
||||||
|
"C. Asset-Liability Ratio=65.22%, Current Ratio=1.75\n"\
|
||||||
|
"D. Asset-Liability Ratio=68.00%, Current Ratio=2.50<|im_end|>\n"\
|
||||||
|
"<|im_start|>assistant\n"'",
|
||||||
|
"max_tokens": 1,
|
||||||
|
"temperature": 0,
|
||||||
|
"stop": ["<|im_end|>"]
|
||||||
|
}' | python3 -m json.tool
|
||||||
|
```
|
||||||
|
|
||||||
|
The output format matches the following:
|
||||||
|
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"id": "cmpl-2f678e8bdf5a4b209a3f2c1fa5832e25",
|
||||||
|
"object": "text_completion",
|
||||||
|
"created": 1754475138,
|
||||||
|
"model": "Qwen/Qwen2.5-0.5B-Instruct",
|
||||||
|
"choices": [
|
||||||
|
{
|
||||||
|
"index": 0,
|
||||||
|
"text": "A",
|
||||||
|
"logprobs": null,
|
||||||
|
"finish_reason": "length",
|
||||||
|
"stop_reason": null,
|
||||||
|
"prompt_logprobs": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"service_tier": null,
|
||||||
|
"system_fingerprint": null,
|
||||||
|
"usage": {
|
||||||
|
"prompt_tokens": 252,
|
||||||
|
"total_tokens": 253,
|
||||||
|
"completion_tokens": 1,
|
||||||
|
"prompt_tokens_details": null
|
||||||
|
},
|
||||||
|
"kv_transfer_params": null
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Install lm-eval in the container.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export HF_ENDPOINT="https://hf-mirror.com"
|
||||||
|
pip install lm-eval[api]
|
||||||
|
```
|
||||||
|
|
||||||
|
Run the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
# Only test gsm8k dataset in this demo
|
||||||
|
lm_eval \
|
||||||
|
--model local-completions \
|
||||||
|
--model_args model=Qwen/Qwen2.5-0.5B-Instruct,base_url=http://127.0.0.1:8000/v1/completions,tokenized_requests=False,trust_remote_code=True \
|
||||||
|
--tasks gsm8k \
|
||||||
|
--output_path ./
|
||||||
|
```
|
||||||
|
|
||||||
|
After 30 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```
|
||||||
|
The markdown format results is as below:
|
||||||
|
|
||||||
|
Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|
||||||
|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|
||||||
|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3215|± |0.0129|
|
||||||
|
| | |strict-match | 5|exact_match|↑ |0.2077|± |0.0112|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Offline Server
|
||||||
|
### 1. Run docker container
|
||||||
|
|
||||||
|
You can run docker container on a single NPU:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VLLM_USE_MODELSCOPE=True \
|
||||||
|
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
|
||||||
|
-it $IMAGE \
|
||||||
|
/bin/bash
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Run gsm8k accuracy test using lm-eval
|
||||||
|
Install lm-eval in the container.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export HF_ENDPOINT="https://hf-mirror.com"
|
||||||
|
pip install lm-eval
|
||||||
|
```
|
||||||
|
|
||||||
|
Run the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
# Only test gsm8k dataset in this demo
|
||||||
|
lm_eval \
|
||||||
|
--model vllm \
|
||||||
|
--model_args pretrained=Qwen/Qwen2.5-0.5B-Instruct,max_model_len=4096 \
|
||||||
|
--tasks gsm8k \
|
||||||
|
--batch_size auto
|
||||||
|
```
|
||||||
|
|
||||||
|
After 1-2 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```
|
||||||
|
The markdown format results is as below:
|
||||||
|
|
||||||
|
Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|
||||||
|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|
||||||
|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.3412|± |0.0131|
|
||||||
|
| | |strict-match | 5|exact_match|↑ |0.3139|± |0.0128|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Use offline Datasets
|
||||||
|
|
||||||
|
Take gsm8k (a single dataset) and mmlu (a multi-subject dataset) as examples; you can see more from [here][2].
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# set HF_DATASETS_OFFLINE when using offline datasets
|
||||||
|
export HF_DATASETS_OFFLINE=1
|
||||||
|
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
|
||||||
|
cd lm-evaluation-harness
|
||||||
|
pip install -e .
|
||||||
|
# gsm8k yaml path
|
||||||
|
cd lm_eval/tasks/gsm8k
|
||||||
|
# mmlu yaml path
|
||||||
|
cd lm_eval/tasks/mmlu/default
|
||||||
|
```
|
||||||
|
|
||||||
|
set [gsm8k.yaml][3] as follows:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
tag:
|
||||||
|
- math_word_problems
|
||||||
|
task: gsm8k
|
||||||
|
|
||||||
|
# set dataset_path arrow or json or parquet according to the downloaded dataset
|
||||||
|
dataset_path: arrow
|
||||||
|
|
||||||
|
# set dataset_name to null
|
||||||
|
dataset_name: null
|
||||||
|
output_type: generate_until
|
||||||
|
|
||||||
|
# add dataset_kwargs
|
||||||
|
dataset_kwargs:
|
||||||
|
data_files:
|
||||||
|
# train and test data download path
|
||||||
|
train: /root/.cache/gsm8k/gsm8k-train.arrow
|
||||||
|
test: /root/.cache/gsm8k/gsm8k-test.arrow
|
||||||
|
|
||||||
|
training_split: train
|
||||||
|
fewshot_split: train
|
||||||
|
test_split: test
|
||||||
|
doc_to_text: 'Q: {{question}}
|
||||||
|
A(Please follow the summarize the result at the end with the format of "The answer is xxx", where xx is the result.):'
|
||||||
|
doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
|
||||||
|
metric_list:
|
||||||
|
- metric: exact_match
|
||||||
|
aggregation: mean
|
||||||
|
higher_is_better: true
|
||||||
|
ignore_case: true
|
||||||
|
ignore_punctuation: false
|
||||||
|
regexes_to_ignore:
|
||||||
|
- ","
|
||||||
|
- "\\$"
|
||||||
|
- "(?s).*#### "
|
||||||
|
- "\\.$"
|
||||||
|
generation_kwargs:
|
||||||
|
until:
|
||||||
|
- "Question:"
|
||||||
|
- "</s>"
|
||||||
|
- "<|im_end|>"
|
||||||
|
do_sample: false
|
||||||
|
temperature: 0.0
|
||||||
|
repeats: 1
|
||||||
|
num_fewshot: 5
|
||||||
|
filter_list:
|
||||||
|
- name: "strict-match"
|
||||||
|
filter:
|
||||||
|
- function: "regex"
|
||||||
|
regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
|
||||||
|
- function: "take_first"
|
||||||
|
- name: "flexible-extract"
|
||||||
|
filter:
|
||||||
|
- function: "regex"
|
||||||
|
group_select: -1
|
||||||
|
regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
|
||||||
|
- function: "take_first"
|
||||||
|
metadata:
|
||||||
|
version: 3.0
|
||||||
|
```
|
||||||
|
|
||||||
|
set [_default_template_yaml][4] as follows:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# set dataset_path according to the downloaded dataset
|
||||||
|
dataset_path: /root/.cache/mmlu
|
||||||
|
test_split: test
|
||||||
|
fewshot_split: dev
|
||||||
|
fewshot_config:
|
||||||
|
sampler: first_n
|
||||||
|
output_type: multiple_choice
|
||||||
|
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
|
||||||
|
doc_to_choice: ["A", "B", "C", "D"]
|
||||||
|
doc_to_target: answer
|
||||||
|
metric_list:
|
||||||
|
- metric: acc
|
||||||
|
aggregation: mean
|
||||||
|
higher_is_better: true
|
||||||
|
metadata:
|
||||||
|
version: 1.0
|
||||||
|
dataset_kwargs:
|
||||||
|
trust_remote_code: true
|
||||||
|
```
|
||||||
|
|
||||||
|
You can see more usage on [Lm-eval Docs][5].
|
||||||
|
|
||||||
|
[1]: https://github.com/EleutherAI/lm-evaluation-harness
|
||||||
|
[2]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#using-local-datasets
|
||||||
|
[3]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml
|
||||||
|
[4]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/mmlu/default/_default_template_yaml
|
||||||
|
[5]: https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/README.md
|
||||||
123
docs/source/developer_guide/evaluation/using_opencompass.md
Normal file
123
docs/source/developer_guide/evaluation/using_opencompass.md
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
# Using OpenCompass
|
||||||
|
This document will guide you through accuracy testing using [OpenCompass](https://github.com/open-compass/opencompass).
|
||||||
|
|
||||||
|
## 1. Online Serving
|
||||||
|
|
||||||
|
You can run docker container to start the vLLM server on a single NPU:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VLLM_USE_MODELSCOPE=True \
|
||||||
|
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
|
||||||
|
-it $IMAGE \
|
||||||
|
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
|
||||||
|
```
|
||||||
|
|
||||||
|
If your service starts successfully, you will see the info shown below:
|
||||||
|
|
||||||
|
```
|
||||||
|
INFO: Started server process [6873]
|
||||||
|
INFO: Waiting for application startup.
|
||||||
|
INFO: Application startup complete.
|
||||||
|
```
|
||||||
|
|
||||||
|
Once your server is started, you can query the model with input prompts in a new terminal:
|
||||||
|
|
||||||
|
```
|
||||||
|
curl http://localhost:8000/v1/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "Qwen/Qwen2.5-7B-Instruct",
|
||||||
|
"prompt": "The future of AI is",
|
||||||
|
"max_tokens": 7,
|
||||||
|
"temperature": 0
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2. Run ceval accuracy test using OpenCompass
|
||||||
|
Install OpenCompass and configure the environment variables in the container.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pin Python 3.10 due to:
|
||||||
|
# https://github.com/open-compass/opencompass/issues/1976
|
||||||
|
conda create -n opencompass python=3.10
|
||||||
|
conda activate opencompass
|
||||||
|
pip install opencompass modelscope[framework]
|
||||||
|
export DATASET_SOURCE=ModelScope
|
||||||
|
git clone https://github.com/open-compass/opencompass.git
|
||||||
|
```
|
||||||
|
|
||||||
|
Add `opencompass/configs/eval_vllm_ascend_demo.py` with the following content:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mmengine.config import read_base
|
||||||
|
from opencompass.models import OpenAISDK
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
from opencompass.configs.datasets.ceval.ceval_gen import ceval_datasets
|
||||||
|
|
||||||
|
# Only test ceval-computer_network dataset in this demo
|
||||||
|
datasets = ceval_datasets[:1]
|
||||||
|
|
||||||
|
api_meta_template = dict(
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', api_role='HUMAN'),
|
||||||
|
dict(role='BOT', api_role='BOT', generate=True),
|
||||||
|
],
|
||||||
|
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
|
||||||
|
)
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
abbr='Qwen2.5-7B-Instruct-vLLM-API',
|
||||||
|
type=OpenAISDK,
|
||||||
|
key='EMPTY', # API key
|
||||||
|
openai_api_base='http://127.0.0.1:8000/v1',
|
||||||
|
path='Qwen/Qwen2.5-7B-Instruct',
|
||||||
|
tokenizer_path='Qwen/Qwen2.5-7B-Instruct',
|
||||||
|
rpm_verbose=True,
|
||||||
|
meta_template=api_meta_template,
|
||||||
|
query_per_second=1,
|
||||||
|
max_out_len=1024,
|
||||||
|
max_seq_len=4096,
|
||||||
|
temperature=0.01,
|
||||||
|
batch_size=8,
|
||||||
|
retry=3,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Run the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
python3 run.py opencompass/configs/eval_vllm_ascend_demo.py --debug
|
||||||
|
```
|
||||||
|
|
||||||
|
After 1-2 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```
|
||||||
|
The markdown format results is as below:
|
||||||
|
|
||||||
|
| dataset | version | metric | mode | Qwen2.5-7B-Instruct-vLLM-API |
|
||||||
|
|----- | ----- | ----- | ----- | -----|
|
||||||
|
| ceval-computer_network | db9ce2 | accuracy | gen | 68.42 |
|
||||||
|
```
|
||||||
|
|
||||||
|
You can see more usage on [OpenCompass Docs](https://opencompass.readthedocs.io/en/latest/index.html).
|
||||||
@@ -0,0 +1,237 @@
|
|||||||
|
# Purpose
|
||||||
|
What information should we have in order to perform model forward pass?
|
||||||
|
- the inputs
|
||||||
|
- the corresponding attention metadata of the inputs
|
||||||
|
|
||||||
|
The following diagram shows what we should prepare for the model inference.
|
||||||
|
|
||||||
|
```
|
||||||
|
+---------------+
|
||||||
|
inputs --> | |
|
||||||
|
| model | --> output
|
||||||
|
attn_meta --> | |
|
||||||
|
+---------------+
|
||||||
|
```
|
||||||
|
|
||||||
|
Therefore, as long as we have these two pieces of information mentioned above, we can perform the model's forward propagation.
|
||||||
|
|
||||||
|
This article will explain **how we obtain the inputs and their corresponding attention metadata** which are on the left part of above diagram.
|
||||||
|
|
||||||
|
# Overview
|
||||||
|
## 1. Obtain inputs
|
||||||
|
The workflow of obtaining the inputs:
|
||||||
|
1. Get `token positions`: The relative position of each token within its request sequence.
|
||||||
|
|
||||||
|
2. Get `token indices`: the index of each scheduled token in the token table.
|
||||||
|
|
||||||
|
3. Get `Token IDs`: Using token indices to retrieve the Token IDs from **token id table**.
|
||||||
|
|
||||||
|
Finally, these `Token IDs` are required as input to the model. In addition, the `positions` should be sent into the model to create the `Rope` (rotary positional embedding). Both of them are inputs of the model.
|
||||||
|
|
||||||
|
**Note**: because the `Token IDs` are the inputs of the model, we will also call them `Input IDs`.
|
||||||
|
## 2. Build inputs attention metadata
|
||||||
|
The model requires these attention metadata during the forward pass:
|
||||||
|
- `query start location`: represents the start and end location of each request corresponding to the scheduled tokens.
|
||||||
|
- `sequence length`: the length of each request including both computed tokens and newly scheduled tokens.
|
||||||
|
- `number of computed tokens`: the number of computed tokens for each request.
|
||||||
|
- `number of requests`: the number of requests in this batch.
|
||||||
|
- `number of tokens`: Total number of scheduled tokens in this batch.
|
||||||
|
- **`block table`**: translates the logical address (within its sequence) of each block to its global physical address in the device's memory.
|
||||||
|
- `max query len`: the longest scheduled tokens length in this requests batch.
|
||||||
|
- `slot mapping`: the indices of each token that input token will be stored into.
|
||||||
|
- `attention mask`: The mask matrix applied to attention scores before softmax to control which tokens can attend to each other. (usually a causal attention)
|
||||||
|
|
||||||
|
# Before start
|
||||||
|
There are mainly three types of variables.
|
||||||
|
- token level: represents one attribute corresponding to each scheduled token, so the length of this variable is the number of scheduled tokens
|
||||||
|
- request level: represents one attribute of each scheduled request, which length usually is the number of scheduled requests. (`query start location` is a special case, which has one more element)
|
||||||
|
- system level:
|
||||||
|
1. **Token IDs table**: store the token ids (i.e. the inputs of the model) of each request. The shape of this table is `(max num request, max model len)`. Here, `max num request` is maximum count of concurrent requests allowed in a forward batch and `max model len` is the max token count can be handled at one request sequence in this model.
|
||||||
|
2. **Block table**: translates the logical address (within its sequence) of each block to its global physical address in the device's memory. The shape of this table is `(max num request, max model len / block size)`
|
||||||
|
|
||||||
|
**Note**: How were these two tables formed?
|
||||||
|
- Both of them are come from the `_update_states` method before **prepare inputs**. You can take a look if you need more inspiration.
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
What is `Token ID`?
|
||||||
|
Simply put, a `token ID` is an **integer** (usually `int32`) which represents a token.
|
||||||
|
example of `Token ID`:
|
||||||
|
|
||||||
|
```
|
||||||
|
| Token ID | Token |
|
||||||
|
|--------------|---------------|
|
||||||
|
| 0 | [PAD] |
|
||||||
|
| 1 | <|endoftext|> |
|
||||||
|
| 2 | <|start|> |
|
||||||
|
| 3 | [SEP] |
|
||||||
|
| 4 | I |
|
||||||
|
| 5 | the |
|
||||||
|
| 6 | be |
|
||||||
|
| 7 | of |
|
||||||
|
| 8 | and |
|
||||||
|
| ... | ... |
|
||||||
|
| ... | ... |
|
||||||
|
| vocab_size-1 | <|im_end|> |
|
||||||
|
```
|
||||||
|
|
||||||
|
# Go through details
|
||||||
|
Make a simple example, assumption:
|
||||||
|
- max tokens can be scheduled at once: 10.
|
||||||
|
- `block size`: 2
|
||||||
|
- Totally schedule 3 requests. Their prompt lengths are 3, 2, and 8 respectively.
|
||||||
|
- `max model length`: 12 (the max token count can be handled at one request sequence in this model).
|
||||||
|
|
||||||
|
These assumptions are configured when vLLM starts up. They are not fixed, so you can set them manually.
|
||||||
|
## Step 1: All requests in the prefill phase
|
||||||
|
|
||||||
|
### Obtain inputs
|
||||||
|
Since the maximum schedulable token count is 10, the scheduled token count of each request is `{'0': 3, '1': 2, '2': 5}`. Note that `request_2` is in chunked prefill and still has 3 prompt tokens that have not yet been scheduled.
|
||||||
|
|
||||||
|
#### 1. Get token positions:
|
||||||
|
First, find out which request each token belongs to: tokens 0~2 belong to request_0, tokens 3~4 belong to request_1, and tokens 5~9 belong to request_2. So, we can use `request indices` to record which request each token belongs to. `request indices`: `[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]`
|
||||||
|
|
||||||
|
For each request, use **the number of tokens already computed** + **the relative position in current scheduled tokens**: `request_0: [0 + 0, 0 + 1, 0 + 2]`, `request_1: [0 + 0, 0 + 1]`, `request_2: [0 + 0, 0 + 1,..., 0 + 4]` and then concat them together: `[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`. Note: there is more efficient way (using `request indices`) to create positions in actual code.
|
||||||
|
|
||||||
|
Finally, `token positions` is `[0, 1, 2, 0, 1, 0, 1, 2, 3, 4]`. This variable is **token level**.
|
||||||
|
|
||||||
|
#### 2. Get token indices:
|
||||||
|
Current **Token IDs table**, which shape is `(max num request, max model len)`.
|
||||||
|
|
||||||
|
Why are `T_3_5`, `T_3_6`, `T_3_7` in this table even though they are not scheduled this time?
|
||||||
|
- We fill all Token IDs of a request sequence into this table at once, but we only retrieve the tokens scheduled this time. The remaining Token IDs will be retrieved next time.
|
||||||
|
|
||||||
|
```
|
||||||
|
| T_0_0 | T_0_1 | T_0_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
| T_1_0 | T_1_1 | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
| T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? |
|
||||||
|
| ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
......
|
||||||
|
......
|
||||||
|
......
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the `T_x_x` is an `int32`
|
||||||
|
|
||||||
|
Let's say `M = max model len`, Then we can use `token positions` together with the `request indices` of each token to construct `token indices`.
|
||||||
|
|
||||||
|
So `token indices` = `[0 + 0 * M, 1 + 0 * M, 2 + 0 * M, 0 + 1 * M, 1 + 1 * M, 0 + 2 * M, 1 + 2 * M, 2 + 2 * M, 3 + 2 * M, 4 + 2 * M]` = `[0, 1, 2, 12, 13, 24, 25, 26, 27, 28]`
|
||||||
|
|
||||||
|
#### 3. Retrieve the Token IDs
|
||||||
|
As mentioned before, we will refer to these `Token IDs` as `Input IDs`.
|
||||||
|
|
||||||
|
We use the `token indices` to select out the corresponding `Input IDs` from the token table, The Pseudocode like:
|
||||||
|
|
||||||
|
```
|
||||||
|
input_ids = token_table[token_indices]
|
||||||
|
```
|
||||||
|
|
||||||
|
As mentioned before, we will refer these Token IDs as Inputs IDs:
|
||||||
|
- `Input IDs` = `[T_0_0, T_0_1, T_0_2, T_1_0, T_1_1, T_2_0, T_2_1, T_3_2, T_3_3, T_3_4]`
|
||||||
|
|
||||||
|
### Build inputs attention metadata
|
||||||
|
Current **Block Table**. We use the first block (i.e. block_0) to mark unused blocks. The shape of the block table is `(max num request, max model len / block size)`, where `max model len / block size = 12 / 2 = 6`.
|
||||||
|
|
||||||
|
```
|
||||||
|
| 1 | 2 | 0 | 0 | 0 | 0 |
|
||||||
|
| 3 | 0 | 0 | 0 | 0 | 0 |
|
||||||
|
| 4 | 5 | 6 | 0 | 0 | 0 |
|
||||||
|
| 0 | 0 | 0 | 0 | 0 | 0 |
|
||||||
|
......
|
||||||
|
......
|
||||||
|
......
|
||||||
|
```
|
||||||
|
|
||||||
|
The kv cache block in the device memory is like:
|
||||||
|
|
||||||
|
```
|
||||||
|
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ......
|
||||||
|
```
|
||||||
|
|
||||||
|
Let's say `K = max model len / block size = 6`; we can then get each token's `device block number` from the block table as follows.
|
||||||
|
|
||||||
|
The workflow of achieving slot mapping:
|
||||||
|
1. get `block table indices` using `K`, `positions` and `request indices`. Purpose: For each token, it could be used to select the `device block number` from `block table`.
|
||||||
|
2. get `device block number` using `block table indices`. Purpose: `device block number` indicates each token belong to which device block.
|
||||||
|
3. get `block offsets` using `positions` and `block size`. Purpose: `block offsets` indicates the offsets of each token within a block.
|
||||||
|
4. construct `slot mapping` using `device block number` and `block offsets`. Purpose: we can use `slot mapping` to store the Token IDs into token slots.
|
||||||
|
|
||||||
|
Details:
|
||||||
|
1. Using a simple formula to calculate the `block table indices`: `request indices * K + positions / block size`. So it equal to `[0 * 6 + 0 / 2, 0 * 6 + 1 / 2, 0 * 6 + 2 / 2, 1 * 6 + 0 / 2, 1 * 6 + 1 / 2, 2 * 6 + 0 / 2, 2 * 6 + 1 / 2, 2 * 6 + 2 / 2, 2 * 6 + 3 / 2, 2 * 6 + 4 / 2] = [0, 0, 1, 6, 6, 12, 12, 13, 13, 14]`. This could be used to select the `device block number` from `block table`. **token level**
|
||||||
|
2. Using the `block table indices` to select out the `device block number` for each scheduled token. The Pseudocode like: `block_numbers = block_table[block_table_indices]`. So `device block number = [1, 1, 2, 3, 3, 4, 4, 5, 5, 6]`**token level**
|
||||||
|
3. `block offsets` could be computed by `block offsets = positions % block size = [0, 1, 0, 0, 1, 0, 1, 0, 1, 0]`. **token level**
|
||||||
|
4. At last, use `block offsets` and `device block number` to create `slot mapping`: `device block number * block size + block_offsets = [2, 3, 4, 6, 7, 8, 9, 10, 11, 12]`
|
||||||
|
|
||||||
|
First, we know the scheduled token count is `[3, 2, 5]` **request level**
|
||||||
|
|
||||||
|
- So, we can use prefix sum to calculate the `query start location`: `[0, 3, 5, 10]`. **request level**
|
||||||
|
- Because in step_1 all the tokens in prefill, computed tokens count is 0, then `sequence length` = `[3, 2, 5]`. **request level**
|
||||||
|
- As mentioned above, `number of computed tokens` are all 0: `[0, 0, 0]`. **request level**
|
||||||
|
- `number of requests`: `3`.
|
||||||
|
- `number of tokens`: `10` (the total number of scheduled tokens in this batch).
|
||||||
|
- `max query len`: `5`.
|
||||||
|
- `slot mapping`: `[2, 3, 4, 6, 7, 8, 9, 10, 11, 12]`. **token level**
|
||||||
|
- `attention mask`: For all request do prefill, we simply create only one mask matrix for reuse across different requests. The shape of this mask matrix is `5 * 5`:
|
||||||
|
|
||||||
|
## Step 2: Chunked prefill
|
||||||
|
In Step 2, we will no longer provide explanations or perform calculations; instead, we will directly present the final result.
|
||||||
|
|
||||||
|
### Obtain inputs
|
||||||
|
The scheduled token of each request: `{'0': 1, '1': 1, '2': 3}`.
|
||||||
|
|
||||||
|
1. `request indices`: `[0, 1, 2, 2, 2]`
|
||||||
|
2. `token positions`: `[3, 2, 5, 6, 7]`
|
||||||
|
|
||||||
|
Current **Token IDs table**:
|
||||||
|
|
||||||
|
```
|
||||||
|
| T_0_0 | T_0_1 | T_0_2 | T_0_3 | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
| T_1_0 | T_1_1 | T_1_2 | ? | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
| T_2_0 | T_2_1 | T_3_2 | T_3_3 | T_3_4 | T_3_5 | T_3_6 | T_3_7 | ? | ? | ? | ? |
|
||||||
|
| ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? | ? |
|
||||||
|
......
|
||||||
|
......
|
||||||
|
......
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: **T_0_3** and **T_1_2** are new Token IDs of request_0 and request_1 respectively; they are sampled from the model's output.
|
||||||
|
|
||||||
|
3. `token indices`: `[3, 14, 29, 30, 31]`
|
||||||
|
4. `Input IDs`: `[T_0_3, T_1_2, T_3_5, T_3_6, T_3_7]`
|
||||||
|
|
||||||
|
### Build inputs attention metadata
|
||||||
|
Current **Block Table**. **Note**: We allocate blocks `7` and `8` to `request_1` and `request_2` respectively, because they need more device space to store the kv cache after generating new tokens or chunk-prefilling more prompt tokens.
|
||||||
|
|
||||||
|
```
|
||||||
|
| 1 | 2 | 0 | 0 | 0 | 0 |
|
||||||
|
| 3 | 7 | 0 | 0 | 0 | 0 |
|
||||||
|
| 4 | 5 | 6 | 8 | 0 | 0 |
|
||||||
|
| 0 | 0 | 0 | 0 | 0 | 0 |
|
||||||
|
......
|
||||||
|
......
|
||||||
|
......
|
||||||
|
```
|
||||||
|
|
||||||
|
The kv cache block in the device memory is still like:
|
||||||
|
|
||||||
|
```
|
||||||
|
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ......
|
||||||
|
```
|
||||||
|
|
||||||
|
1. `block table indices`: `[1, 7, 14, 15, 15]`. **token level**
|
||||||
|
2. `device block number`: `[2, 7, 6, 8, 8]`. **token level**
|
||||||
|
3. `block offsets`: `[1, 0, 1, 0, 1]` **token level**
|
||||||
|
4. `slot mapping`: `[5, 14, 13, 16, 17]` **token level**
|
||||||
|
|
||||||
|
scheduled token count is `[1, 1, 3]`
|
||||||
|
- `query start location`: `[0, 1, 2, 5]`
|
||||||
|
- `sequence length`: `[4, 3, 8]`
|
||||||
|
- `number of computed tokens`: `[3, 2, 5]`
|
||||||
|
- `number of requests`: `3`
|
||||||
|
- `max query len`: `3`
|
||||||
|
- `slot mapping`: `[5, 14, 13, 16, 17]`
|
||||||
|
- `attention mask`: `5 * 8`. Each token has a `1 * 8` vector, and there are 5 scheduled tokens.
|
||||||
|
|
||||||
|
# At last
|
||||||
|
If you understand step_1 and step_2, you will understand all the following steps.
|
||||||
|
|
||||||
|
Hope this article helps you gain a better understanding of how vLLM prepares inputs for model forwarding. If you have any good ideas, you are welcome to contribute.
|
||||||
10
docs/source/developer_guide/feature_guide/index.md
Normal file
10
docs/source/developer_guide/feature_guide/index.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Feature Guide
|
||||||
|
|
||||||
|
This section provides an overview of the features implemented in vLLM Ascend. Developers can refer to this guide to understand how vLLM Ascend works.
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Feature Guide
|
||||||
|
:maxdepth: 1
|
||||||
|
patch
|
||||||
|
ModelRunner_prepare_inputs
|
||||||
|
:::
|
||||||
85
docs/source/developer_guide/feature_guide/patch.md
Normal file
85
docs/source/developer_guide/feature_guide/patch.md
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# Patch in vLLM Ascend
|
||||||
|
|
||||||
|
vLLM Ascend is a platform plugin for vLLM. Because the release cycles of vLLM and vLLM Ascend are different, and due to hardware limitations in some cases, we need to patch some code in vLLM to make it compatible with vLLM Ascend.
|
||||||
|
|
||||||
|
In vLLM Ascend code, we provide a patch module `vllm_ascend/patch` to address the change for vLLM.
|
||||||
|
|
||||||
|
## Principle
|
||||||
|
|
||||||
|
We should keep in mind that Patch is not the best way to make vLLM Ascend compatible. It's just a temporary solution. The best way is to contribute the change to vLLM to make it compatible with vLLM Ascend originally. In vLLM Ascend, we have the basic principle for Patch strategy:
|
||||||
|
|
||||||
|
1. Less is more. Please do not patch unless it's the only way currently.
|
||||||
|
2. Once a patch is added, it's required to describe the future plan for removing the patch.
|
||||||
|
3. Cleaning up patch code is welcome at any time.
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
In `vllm_ascend/patch`, you can see the code structure as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
vllm_ascend
|
||||||
|
├── patch
|
||||||
|
│ ├── platform
|
||||||
|
│ │ ├── patch_0_9_2
|
||||||
|
│ │ ├── patch_common
|
||||||
|
│ │ ├── patch_main
|
||||||
|
│ ├── worker
|
||||||
|
│ │ ├── patch_0_9_2
|
||||||
|
│ │ ├── patch_common
|
||||||
|
│ │ ├── patch_main
|
||||||
|
└───────────
|
||||||
|
```
|
||||||
|
|
||||||
|
- **platform**: The patch code in this directory is for patching the code in vLLM main process. It's called by `vllm_ascend/platform::NPUPlatform::pre_register_and_update` very early when vLLM is initialized.
|
||||||
|
- For online mode, vLLM process calls the platform patch here `vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing the cli args.
|
||||||
|
- For offline mode, vLLM process calls the platform patch here `vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when parsing the input parameters.
|
||||||
|
- **worker**: The patch code in this directory is for patching the code in vLLM worker process. It's called by `vllm_ascend/worker/worker_v1::NPUWorker::__init__` when the vLLM worker process is initialized.
|
||||||
|
- For both online and offline mode, vLLM engine core process calls the worker patch here `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` when initializing the worker process.
|
||||||
|
|
||||||
|
In both **platform** and **worker** folder, there are several patch modules. They are used for patching different version of vLLM.
|
||||||
|
|
||||||
|
- `patch_0_10_0`: This module is used for patching vLLM 0.10.0. The version is always the nearest version of vLLM. Once vLLM is released, we will drop this patch module and bump to a new version. For example, `patch_0_10_0` is used for patching vLLM 0.10.0.
|
||||||
|
- `patch_main`: This module is used for patching the code in vLLM main branch.
|
||||||
|
- `patch_common`: This module is used for patching both vLLM 0.10.0 and vLLM main branch.
|
||||||
|
|
||||||
|
## How to write a patch
|
||||||
|
|
||||||
|
Before writing a patch, following the principles above, we should patch the least amount of code possible. If necessary, we can patch the code in either the **platform** or the **worker** folder. Here is an example of patching the `distributed` module in vLLM.
|
||||||
|
|
||||||
|
1. Decide which version of vLLM we should patch. For example, after analysis, here we want to patch both 0.10.0 and main of vLLM.
|
||||||
|
2. Decide which process we should patch. For example, here `distributed` belongs to the vLLM main process, so we should patch `platform`.
|
||||||
|
3. Create the patch file in the right folder. The file should be named as `patch_{module_name}.py`. The example here is `vllm_ascend/patch/platform/patch_common/patch_distributed.py`.
|
||||||
|
4. Write your patch code in the new file. Here is an example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import vllm
|
||||||
|
|
||||||
|
def patch_destroy_model_parallel():
|
||||||
|
# your patch code
|
||||||
|
...
|
||||||
|
|
||||||
|
vllm.distributed.parallel_state.destroy_model_parallel = patch_destroy_model_parallel
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Import the patch file in `__init__.py`. In this example, add `import vllm_ascend.patch.platform.patch_common.patch_distributed` into `vllm_ascend/patch/platform/patch_common/__init__.py`.
|
||||||
|
6. Add the description of the patch in `vllm_ascend/patch/__init__.py`. The description format is as follows:
|
||||||
|
|
||||||
|
```
|
||||||
|
# ** File: <The patch file name> **
|
||||||
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
# 1. `<The target patch module in vLLM>`
|
||||||
|
# Why:
|
||||||
|
# <Describe the reason why we need to patch>
|
||||||
|
# How:
|
||||||
|
# <Describe the way to patch>
|
||||||
|
# Related PR (if no, explain why):
|
||||||
|
# <Add a link to the related PR in vLLM. If there is no related PR, explain why>
|
||||||
|
# Future Plan:
|
||||||
|
# <Describe the future plan to remove the patch>
|
||||||
|
```
|
||||||
|
|
||||||
|
7. Add the Unit Test and E2E Test. Any newly added code in vLLM Ascend should contain the Unit Test and E2E Test as well. You can find more details in [test guide](../contribution/testing.md)
|
||||||
|
|
||||||
|
## Limitation
|
||||||
|
1. In the V1 Engine, vLLM starts three kinds of processes: the Main process, the EngineCore process and the Worker process. Currently vLLM Ascend only supports patching the code in the Main process and the Worker process by default. If you want to patch code that runs in the EngineCore process, you should patch the EngineCore process entirely during setup; the entry code is in `vllm.v1.engine.core`. Please override `EngineCoreProc` and `DPEngineCoreProc` entirely.
|
||||||
|
2. If you are running an edited vLLM codebase, the version of vLLM may change automatically. For example, if you run an edited vLLM based on v0.9.n, the version of vLLM may change to v0.9.nxxx; in this case, the patch for v0.9.n in vLLM Ascend would not work as expected, because vLLM Ascend cannot distinguish the version of vLLM you are using. In this case, you can set the environment variable `VLLM_VERSION` to specify the version of vLLM you are using, and then the patch for that version should work.
|
||||||
259
docs/source/developer_guide/modeling/adding_a_new_model.md
Normal file
259
docs/source/developer_guide/modeling/adding_a_new_model.md
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
# Adding a New Model
|
||||||
|
|
||||||
|
This guide demonstrates how to integrate a novel or customized model into vllm-ascend. For foundational concepts, it is highly recommended to refer to
|
||||||
|
[vllm official doc: Adding a New Model](https://docs.vllm.ai/en/stable/contributing/model/) first.
|
||||||
|
|
||||||
|
## Step 1: Implementing Models with `torch` and `torch_npu`
|
||||||
|
|
||||||
|
This section provides instructions for implementing new models compatible with vllm and vllm-ascend.
|
||||||
|
|
||||||
|
**Before starting:**
|
||||||
|
|
||||||
|
- Verify whether your model already exists in vllm's [models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory.
|
||||||
|
- Use existing models' implementation as templates to accelerate your development.
|
||||||
|
|
||||||
|
### Method 1: Implementing New Models from Scratch
|
||||||
|
|
||||||
|
Follow vllm's [OPT model adaptation](https://docs.vllm.ai/en/stable/contributing/model/basic.html) example for guidance.
|
||||||
|
|
||||||
|
**Key implementation requirements:**
|
||||||
|
|
||||||
|
1. Place model files in `vllm_ascend/models/` directory.
|
||||||
|
|
||||||
|
2. Standard module structure for decoder-only LLMs (please checkout vllm's implementations for other kinds of model):
|
||||||
|
|
||||||
|
- `*ModelForCausalLM` (top-level wrapper)
|
||||||
|
- `*Model` (main architecture)
|
||||||
|
- `*DecoderLayer` (transformer block)
|
||||||
|
- `*Attention` and `*MLP` (specific computation unit)
|
||||||
|
|
||||||
|
:::{note}
|
||||||
|
`*` denotes your model's unique identifier.
|
||||||
|
:::
|
||||||
|
|
||||||
|
3. Critical Implementation Details:
|
||||||
|
|
||||||
|
All modules must include a `prefix` argument in `__init__()`.
|
||||||
|
|
||||||
|
**Required interfaces:**
|
||||||
|
|
||||||
|
| Module Type | Required Methods |
|
||||||
|
| :------------------- | :---------------------------------------- |
|
||||||
|
| `*ModelForCausalLM` | `get_input_embeddings`, `compute_logits`, `load_weights` |
|
||||||
|
| `*Model` | `get_input_embeddings`, `load_weights` |
|
||||||
|
|
||||||
|
4. Attention Backend Integration:
|
||||||
|
|
||||||
|
Importing attention via `from vllm.attention import Attention` can automatically leverage the attention backend routing of vllm-ascend (see: `get_attn_backend_cls()` in `vllm_ascend/platform.py`).
|
||||||
|
|
||||||
|
5. Tensor Parallelism:
|
||||||
|
|
||||||
|
Use vllm's parallel layers (`ColumnParallelLinear`, `VocabParallelEmbedding`, etc.) to implement models supporting tensor parallelism. Note that Ascend-specific customizations are implemented in `vllm_ascend/ops/` directory (RMSNorm, VocabParallelEmbedding, etc.).
|
||||||
|
|
||||||
|
**Reference Implementation Template** (assumed path: `vllm_ascend/models/custom_model.py`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from collections.abc import Iterable
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from vllm.attention import Attention
|
||||||
|
from vllm.config import VllmConfig
|
||||||
|
from vllm.sequence import IntermediateTensors
|
||||||
|
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||||
|
|
||||||
|
class CustomAttention(nn.Module):
|
||||||
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
|
super().__init__()
|
||||||
|
self.attn = Attention(prefix=f"{prefix}.attn")
|
||||||
|
|
||||||
|
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||||
|
# Implement attention logic
|
||||||
|
...
|
||||||
|
|
||||||
|
class CustomDecoderLayer(nn.Module):
|
||||||
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
|
super().__init__()
|
||||||
|
self.self_attn = CustomAttention(vllm_config, prefix=f"{prefix}.self_attn")
|
||||||
|
|
||||||
|
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||||
|
# Implement decoder layer
|
||||||
|
...
|
||||||
|
|
||||||
|
class CustomModel(nn.Module):
|
||||||
|
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||||
|
super().__init__()
|
||||||
|
self.layers = nn.ModuleList([
|
||||||
|
CustomDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}")
|
||||||
|
for i in range(vllm_config.model_config.hf_config.num_hidden_layers)
|
||||||
|
])
|
||||||
|
|
||||||
|
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||||
|
...
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
|
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||||
|
...
|
||||||
|
|
||||||
|
def load_weights(self,
|
||||||
|
weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||||
|
...
|
||||||
|
|
||||||
|
class CustomModelForCausalLM(nn.Module):
|
||||||
|
def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
|
super().__init__()
|
||||||
|
self.model = CustomModel(vllm_config, prefix=f"{prefix}.model")
|
||||||
|
|
||||||
|
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
|
||||||
|
...
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
|
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||||
|
...
|
||||||
|
|
||||||
|
def compute_logits(self,
|
||||||
|
hidden_states: torch.Tensor,
|
||||||
|
sampling_metadata: SamplingMetadata) -> torch.Tensor:
|
||||||
|
...
|
||||||
|
|
||||||
|
def load_weights(self,
|
||||||
|
weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Method 2: Customizing Existing vLLM Models
|
||||||
|
|
||||||
|
For most use cases, extending existing implementations is preferable. We demonstrate an example to inherit from base classes and implement a custom deepseek model below (assumed path: `vllm_ascend/models/deepseek_v2.py`).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from typing import List, Optional
|
||||||
|
import torch
|
||||||
|
from vllm.attention import AttentionMetadata
|
||||||
|
from vllm.model_executor.models.deepseek_v2 import DeepseekV2ForCausalLM
|
||||||
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
|
class CustomDeepseekV2ForCausalLM(DeepseekV2ForCausalLM):
|
||||||
|
# Define merged weights for quantization/efficiency
|
||||||
|
packed_modules_mapping = {
|
||||||
|
"gate_up_proj": ["gate_proj", "up_proj"],
|
||||||
|
"experts": ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"]
|
||||||
|
}
|
||||||
|
|
||||||
|
def forward(
|
||||||
|
self,
|
||||||
|
input_ids: torch.Tensor,
|
||||||
|
positions: torch.Tensor,
|
||||||
|
kv_caches: Optional[List[torch.Tensor]] = None,
|
||||||
|
attn_metadata: Optional[AttentionMetadata] = None,
|
||||||
|
intermediate_tensors: Optional[IntermediateTensors] = None,
|
||||||
|
inputs_embeds: Optional[torch.Tensor] = None,
|
||||||
|
) -> Union[torch.Tensor, IntermediateTensors]:
|
||||||
|
# Custom forward logic
|
||||||
|
hidden_states = self.model(
|
||||||
|
input_ids,
|
||||||
|
positions,
|
||||||
|
kv_caches,
|
||||||
|
attn_metadata,
|
||||||
|
intermediate_tensors,
|
||||||
|
inputs_embeds
|
||||||
|
)
|
||||||
|
return hidden_states
|
||||||
|
```
|
||||||
|
|
||||||
|
:::{note}
|
||||||
|
For a complete implementation reference, see: `vllm_ascend/models/deepseek_v2.py`.
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Step 2: Registering Custom Models using ModelRegistry Plugins in vLLM
|
||||||
|
|
||||||
|
vllm provides a plugin mechanism for registering externally implemented models without modifying its codebase.
|
||||||
|
|
||||||
|
To integrate your implemented model from `vllm_ascend/models/` directory:
|
||||||
|
|
||||||
|
1. Import your model implementation in `vllm_ascend/models/__init__.py` using relative imports.
|
||||||
|
2. Register the model wrapper class via `vllm.ModelRegistry.register_model()` function.
|
||||||
|
|
||||||
|
**Reference Registration Template** (an example of registering new models in `vllm_ascend/models/__init__.py`):
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import ModelRegistry
|
||||||
|
|
||||||
|
def register_model():
|
||||||
|
from .custom_model import CustomModelForCausalLM # New custom model
|
||||||
|
from .deepseek_v2 import ModifiedDeepseekV2ForCausalLM # Customized Deepseek
|
||||||
|
|
||||||
|
# For NEW architectures: Register with unique name
|
||||||
|
ModelRegistry.register_model(
|
||||||
|
"CustomModelForCausalLM", # Must match config.json's 'architectures'
|
||||||
|
"vllm_ascend.models.custom_model:CustomModelForCausalLM"
|
||||||
|
)
|
||||||
|
|
||||||
|
# For MODIFIED architectures: Use original name
|
||||||
|
ModelRegistry.register_model(
|
||||||
|
"DeepseekV2ForCausalLM", # Original architecture identifier in vLLM
|
||||||
|
"vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM "
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
:::{note}
|
||||||
|
The first argument of `vllm.ModelRegistry.register_model()` indicates the unique architecture identifier which must match `architectures` in `config.json` of the model.
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"CustomModelForCausalLM"
|
||||||
|
],
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Step 3: Verification
|
||||||
|
|
||||||
|
### Case 1: Overriding Existing vLLM Model Architecture
|
||||||
|
|
||||||
|
If you're registering a customized model architecture based on vllm's existing implementation (overriding vllm's original class), when executing vllm offline/online inference (using any model), you'll observe warning logs similar to the following output from `vllm/model_executor/models/registry.py`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
Model architecture DeepseekV2ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend/models/deepseek_v2:CustomDeepseekV2ForCausalLM.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Case 2: Registering New Model Architecture
|
||||||
|
|
||||||
|
If you're registering a novel model architecture not present in vllm (creating a completely new class), current logs won't provide explicit confirmation by default. It's recommended to add the following logging statement at the end of the `register_model` method in `vllm/model_executor/models/registry.py`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
logger.info(f"model_arch: {model_arch} has been registered here!")
|
||||||
|
```
|
||||||
|
|
||||||
|
After adding this line, you will see confirmation logs shown below when running vllm offline/online inference (using any model).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
model_arch: CustomModelForCausalLM has been registered here!
|
||||||
|
```
|
||||||
|
|
||||||
|
This log output confirms your novel model architecture has been successfully registered in vllm.
|
||||||
|
|
||||||
|
## Step 4: Testing
|
||||||
|
|
||||||
|
After adding a new model, we should do basic functional test (offline/online inference), accuracy test and performance benchmark for the model.
|
||||||
|
|
||||||
|
Find more details at:
|
||||||
|
|
||||||
|
- [Accuracy test guide](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html)
|
||||||
|
- [Performance benchmark guide](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)
|
||||||
|
|
||||||
|
## Step 5: Updating Supported Models Doc
|
||||||
|
|
||||||
|
At last, if all the steps above are completed, you should add the new model into our [Supported Models](https://vllm-ascend.readthedocs.io/en/latest/user_guide/supported_models.html) doc.
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
# Adding a New Multi-Modal Model
|
||||||
|
|
||||||
|
**_Coming soon ..._**
|
||||||
10
docs/source/developer_guide/modeling/index.md
Normal file
10
docs/source/developer_guide/modeling/index.md
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Modeling
|
||||||
|
|
||||||
|
This section provides tutorials of how to implement and register a new model into vllm-ascend.
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Modeling
|
||||||
|
:maxdepth: 1
|
||||||
|
adding_a_new_model
|
||||||
|
adding_a_new_multimodal_model
|
||||||
|
:::
|
||||||
9
docs/source/developer_guide/performance/index.md
Normal file
9
docs/source/developer_guide/performance/index.md
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# Performance
|
||||||
|
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Performance
|
||||||
|
:maxdepth: 1
|
||||||
|
performance_benchmark
|
||||||
|
profile_execute_duration
|
||||||
|
optimization_and_tuning
|
||||||
|
:::
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
# Optimization and Tuning
|
||||||
|
|
||||||
|
This guide aims to help users to improve vllm-ascend performance on system level. It includes OS configuration, library optimization, deploy guide and so on. Any feedback is welcome.
|
||||||
|
|
||||||
|
## Preparation
|
||||||
|
|
||||||
|
Run the container:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci0
|
||||||
|
# Update the cann base image
|
||||||
|
export IMAGE=m.daocloud.io/quay.io/ascend/cann:|cann_image_tag|
|
||||||
|
docker run --rm \
|
||||||
|
--name performance-test \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-it $IMAGE bash
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure your environment:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Configure the mirror
|
||||||
|
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" > /etc/apt/sources.list && \
|
||||||
|
echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-updates main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-backports main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list && \
|
||||||
|
echo "deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu-ports/ jammy-security main restricted universe multiverse" >> /etc/apt/sources.list
|
||||||
|
|
||||||
|
# Install os packages
|
||||||
|
apt update && apt install wget gcc g++ libnuma-dev git vim -y
|
||||||
|
```
|
||||||
|
|
||||||
|
Install vllm and vllm-ascend:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Install necessary dependencies
|
||||||
|
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
|
||||||
|
pip install modelscope pandas datasets gevent sacrebleu rouge_score pybind11 pytest
|
||||||
|
|
||||||
|
# Configure this var to speed up model download
|
||||||
|
export VLLM_USE_MODELSCOPE=true
|
||||||
|
```
|
||||||
|
|
||||||
|
Please follow the [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) to make sure vllm, vllm-ascend and mindie-turbo are installed correctly.
|
||||||
|
|
||||||
|
:::{note}
|
||||||
|
Make sure your vllm and vllm-ascend are installed after your python configuration completed, because these packages will build binary files using the python in current environment. If you install vllm, vllm-ascend and mindie-turbo before chapter 1.1, the binary files will not use the optimized python.
|
||||||
|
:::
|
||||||
|
|
||||||
|
## Optimizations
|
||||||
|
|
||||||
|
### 1. Compilation Optimization
|
||||||
|
|
||||||
|
#### 1.1. Install optimized `python`
|
||||||
|
|
||||||
|
Python supports **LTO** and **PGO** optimization starting from version `3.6` and above, which can be enabled at compile time. And we have offered compilation optimized `python` packages directly to users for the sake of convenience. You can also reproduce the `python` build follow this [tutorial](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0063.html) according to your specific scenarios.
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
mkdir -p /workspace/tmp
|
||||||
|
cd /workspace/tmp
|
||||||
|
|
||||||
|
# Download prebuilt lib and packages
|
||||||
|
wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libcrypto.so.1.1
|
||||||
|
wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libomp.so
|
||||||
|
wget https://repo.oepkgs.net/ascend/pytorch/vllm/lib/libssl.so.1.1
|
||||||
|
wget https://repo.oepkgs.net/ascend/pytorch/vllm/python/py311_bisheng.tar.gz
|
||||||
|
|
||||||
|
# Configure python and pip
|
||||||
|
cp ./*.so* /usr/local/lib
|
||||||
|
tar -zxvf ./py311_bisheng.* -C /usr/local/
|
||||||
|
mv /usr/local/py311_bisheng/ /usr/local/python
|
||||||
|
sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3
|
||||||
|
sed -i "1c#\!/usr/local/python/bin/python3.11" /usr/local/python/bin/pip3.11
|
||||||
|
ln -sf /usr/local/python/bin/python3 /usr/bin/python
|
||||||
|
ln -sf /usr/local/python/bin/python3 /usr/bin/python3
|
||||||
|
ln -sf /usr/local/python/bin/python3.11 /usr/bin/python3.11
|
||||||
|
ln -sf /usr/local/python/bin/pip3 /usr/bin/pip3
|
||||||
|
ln -sf /usr/local/python/bin/pip3 /usr/bin/pip
|
||||||
|
|
||||||
|
export PATH=/usr/bin:/usr/local/python/bin:$PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. OS Optimization
|
||||||
|
|
||||||
|
#### 2.1. jemalloc
|
||||||
|
|
||||||
|
**jemalloc** is a memory allocator that improves performance in multi-threaded scenarios and can reduce memory fragmentation. jemalloc uses a thread-local memory manager to allocate variables, which avoids lock contention between threads and can hugely optimize performance.
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Install jemalloc
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install libjemalloc2
|
||||||
|
|
||||||
|
# Configure jemalloc
|
||||||
|
export LD_PRELOAD="/usr/lib/$(uname -i)-linux-gnu/libjemalloc.so.2 $LD_PRELOAD"
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2.2. Tcmalloc
|
||||||
|
|
||||||
|
**Tcmalloc (Thread-Caching Malloc)** is a universal memory allocator that improves overall performance while ensuring low latency by introducing a multi-level cache structure, reducing mutex competition and optimizing large object processing flow. Find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/700/ptmoddevg/trainingmigrguide/performance_tuning_0068.html).
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Install tcmalloc
|
||||||
|
sudo apt update
|
||||||
|
sudo apt install libgoogle-perftools4 libgoogle-perftools-dev
|
||||||
|
|
||||||
|
# Get the location of libtcmalloc.so*
|
||||||
|
find /usr -name "libtcmalloc.so*"
|
||||||
|
|
||||||
|
# Make the priority of tcmalloc higher
|
||||||
|
# The <path> is the location of libtcmalloc.so we get from the upper command
|
||||||
|
# Example: "$LD_PRELOAD:/usr/lib/aarch64-linux-gnu/libtcmalloc.so"
|
||||||
|
export LD_PRELOAD="$LD_PRELOAD:<path>"
|
||||||
|
|
||||||
|
# Verify your configuration
|
||||||
|
# The path of libtcmalloc.so will be contained in the result if your configuration is valid
|
||||||
|
ldd `which python`
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. `torch_npu` Optimization
|
||||||
|
|
||||||
|
Some performance tuning features in `torch_npu` are controlled by environment variables. Some features and their related environment variables are shown below.
|
||||||
|
|
||||||
|
Memory optimization:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Upper limit of memory block splitting allowed (MB), Setting this parameter can prevent large memory blocks from being split.
|
||||||
|
export PYTORCH_NPU_ALLOC_CONF="max_split_size_mb:250"
|
||||||
|
|
||||||
|
# When operators on the communication stream have dependencies, they all need to be ended before being released for reuse. The logic of multi-stream reuse is to release the memory on the communication stream in advance so that the computing stream can be reused.
|
||||||
|
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
|
||||||
|
```
|
||||||
|
|
||||||
|
Schedule optimization:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Optimize operator delivery queue, this will affect the memory peak value, and may degrade if the memory is tight.
|
||||||
|
export TASK_QUEUE_ENABLE=2
|
||||||
|
|
||||||
|
# This will greatly improve the CPU bottleneck model and ensure the same performance for the NPU bottleneck model.
|
||||||
|
export CPU_AFFINITY_CONF=1
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4. CANN Optimization
|
||||||
|
|
||||||
|
#### 4.1. HCCL Optimization
|
||||||
|
|
||||||
|
There are some performance tuning features in HCCL, which are controlled by environment variables.
|
||||||
|
|
||||||
|
You can configure HCCL to use "AIV" mode to optimize performance by setting the environment variable shown below. In "AIV" mode, the communication is scheduled by AI vector core directly with ROCE, instead of being scheduled by AI cpu.
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
export HCCL_OP_EXPANSION_MODE="AIV"
|
||||||
|
```
|
||||||
|
|
||||||
|
Plus, there are more features for performance optimization in specific scenarios, which are shown below.
|
||||||
|
|
||||||
|
- `HCCL_INTRA_ROCE_ENABLE`: Use RDMA link instead of SDMA link between two 8Ps as the mesh interconnect link, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0044.html).
|
||||||
|
- `HCCL_RDMA_TC`: Use this var to configure traffic class of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0045.html).
|
||||||
|
- `HCCL_RDMA_SL`: Use this var to configure service level of RDMA network card, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0046.html).
|
||||||
|
- `HCCL_BUFFSIZE`: Use this var to control the cache size for sharing data between two NPUs, find more details [here](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0047.html).
|
||||||
194
docs/source/developer_guide/performance/performance_benchmark.md
Normal file
194
docs/source/developer_guide/performance/performance_benchmark.md
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
# Performance Benchmark
|
||||||
|
This document details the benchmark methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. To maintain alignment with vLLM, we use the [benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) script provided by the vllm project.
|
||||||
|
|
||||||
|
**Benchmark Coverage**: We measure offline e2e latency and throughput, and fixed-QPS online serving benchmarks, for more details see [vllm-ascend benchmark scripts](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks).
|
||||||
|
|
||||||
|
## 1. Run docker container
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
export IMAGE=m.daocloud.io/quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-p 8000:8000 \
|
||||||
|
-e VLLM_USE_MODELSCOPE=True \
|
||||||
|
-e PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256 \
|
||||||
|
-it $IMAGE \
|
||||||
|
/bin/bash
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2. Install dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /workspace/vllm-ascend
|
||||||
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
|
pip install -r benchmarks/requirements-bench.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## 3. (Optional) Prepare model weights
|
||||||
|
For faster running speed, we recommend downloading the model in advance:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
modelscope download --model LLM-Research/Meta-Llama-3.1-8B-Instruct
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also replace all model paths in the [json](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests) files with your local paths:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"test_name": "latency_llama8B_tp1",
|
||||||
|
"parameters": {
|
||||||
|
"model": "your local model path",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"load_format": "dummy",
|
||||||
|
"num_iters_warmup": 5,
|
||||||
|
"num_iters": 15
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Run benchmark script
|
||||||
|
Run benchmark script:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash benchmarks/scripts/run-performance-benchmarks.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
After about 10 mins, the output is as shown below:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
online serving:
|
||||||
|
qps 1:
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 200
|
||||||
|
Benchmark duration (s): 212.77
|
||||||
|
Total input tokens: 42659
|
||||||
|
Total generated tokens: 43545
|
||||||
|
Request throughput (req/s): 0.94
|
||||||
|
Output token throughput (tok/s): 204.66
|
||||||
|
Total Token throughput (tok/s): 405.16
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 104.14
|
||||||
|
Median TTFT (ms): 102.22
|
||||||
|
P99 TTFT (ms): 153.82
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 38.78
|
||||||
|
Median TPOT (ms): 38.70
|
||||||
|
P99 TPOT (ms): 48.03
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 38.46
|
||||||
|
Median ITL (ms): 36.96
|
||||||
|
P99 ITL (ms): 75.03
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
qps 4:
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 200
|
||||||
|
Benchmark duration (s): 72.55
|
||||||
|
Total input tokens: 42659
|
||||||
|
Total generated tokens: 43545
|
||||||
|
Request throughput (req/s): 2.76
|
||||||
|
Output token throughput (tok/s): 600.24
|
||||||
|
Total Token throughput (tok/s): 1188.27
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 115.62
|
||||||
|
Median TTFT (ms): 109.39
|
||||||
|
P99 TTFT (ms): 169.03
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 51.48
|
||||||
|
Median TPOT (ms): 52.40
|
||||||
|
P99 TPOT (ms): 69.41
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 50.47
|
||||||
|
Median ITL (ms): 43.95
|
||||||
|
P99 ITL (ms): 130.29
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
qps 16:
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 200
|
||||||
|
Benchmark duration (s): 47.82
|
||||||
|
Total input tokens: 42659
|
||||||
|
Total generated tokens: 43545
|
||||||
|
Request throughput (req/s): 4.18
|
||||||
|
Output token throughput (tok/s): 910.62
|
||||||
|
Total Token throughput (tok/s): 1802.70
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 128.50
|
||||||
|
Median TTFT (ms): 128.36
|
||||||
|
P99 TTFT (ms): 187.87
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 83.60
|
||||||
|
Median TPOT (ms): 77.85
|
||||||
|
P99 TPOT (ms): 165.90
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 65.72
|
||||||
|
Median ITL (ms): 54.84
|
||||||
|
P99 ITL (ms): 289.63
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
qps inf:
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 200
|
||||||
|
Benchmark duration (s): 41.26
|
||||||
|
Total input tokens: 42659
|
||||||
|
Total generated tokens: 43545
|
||||||
|
Request throughput (req/s): 4.85
|
||||||
|
Output token throughput (tok/s): 1055.44
|
||||||
|
Total Token throughput (tok/s): 2089.40
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 3394.37
|
||||||
|
Median TTFT (ms): 3359.93
|
||||||
|
P99 TTFT (ms): 3540.93
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 66.28
|
||||||
|
Median TPOT (ms): 64.19
|
||||||
|
P99 TPOT (ms): 97.66
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 56.62
|
||||||
|
Median ITL (ms): 55.69
|
||||||
|
P99 ITL (ms): 82.90
|
||||||
|
==================================================
|
||||||
|
|
||||||
|
offline:
|
||||||
|
latency:
|
||||||
|
Avg latency: 4.944929537673791 seconds
|
||||||
|
10% percentile latency: 4.894104263186454 seconds
|
||||||
|
25% percentile latency: 4.909652255475521 seconds
|
||||||
|
50% percentile latency: 4.932477846741676 seconds
|
||||||
|
75% percentile latency: 4.9608619548380375 seconds
|
||||||
|
90% percentile latency: 5.035418218374252 seconds
|
||||||
|
99% percentile latency: 5.052476694583893 seconds
|
||||||
|
|
||||||
|
throughput:
|
||||||
|
Throughput: 4.64 requests/s, 2000.51 total tokens/s, 1010.54 output tokens/s
|
||||||
|
Total num prompt tokens: 42659
|
||||||
|
Total num output tokens: 43545
|
||||||
|
```
|
||||||
|
|
||||||
|
The result json files are generated into the path `benchmark/results`.
|
||||||
|
These files contain detailed benchmarking results for further analysis.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
.
|
||||||
|
|-- latency_llama8B_tp1.json
|
||||||
|
|-- serving_llama8B_tp1_qps_1.json
|
||||||
|
|-- serving_llama8B_tp1_qps_16.json
|
||||||
|
|-- serving_llama8B_tp1_qps_4.json
|
||||||
|
|-- serving_llama8B_tp1_qps_inf.json
|
||||||
|
`-- throughput_llama8B_tp1.json
|
||||||
|
```
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# Profile Execute Duration
|
||||||
|
|
||||||
|
The execution duration of each stage (including pre/post-processing, model forward, etc.) usually needs to be captured during a complete inference process. Typically, this is done by using `torch.npu.synchronize()` and obtaining CPU timestamps, which increases the performance overhead of host/device synchronization.
|
||||||
|
|
||||||
|
**To reduce the performance overhead, we add this feature, using the NPU event timestamp mechanism to observe the device execution time asynchronously.**
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
* Use the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to enable this feature.
|
||||||
|
* Use the non-blocking API `ProfileExecuteDuration().capture_async` to set observation points asynchronously when you need to observe the execution duration.
|
||||||
|
* Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an appropriate time to get and print the execution durations of all observed stages.
|
||||||
|
|
||||||
|
**We have instrumented the key inference stages (including pre-processing, model forward pass, etc.) for execute duration profiling. Execute the script as follows:**
|
||||||
|
|
||||||
|
```
|
||||||
|
VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE=1 python3 vllm-ascend/examples/offline_inference_npu.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Example Output
|
||||||
|
|
||||||
|
```
|
||||||
|
5691:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.17ms [prepare input and forward]:9.57ms [forward]:4.14ms
|
||||||
|
5695:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.29ms [prepare input and forward]:10.19ms [forward]:4.14ms
|
||||||
|
5697:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.81ms [prepare input and forward]:10.29ms [forward]:3.99ms
|
||||||
|
5701:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.10ms [prepare input and forward]:10.62ms [forward]:4.33ms
|
||||||
|
5705:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.65ms [prepare input and forward]:9.58ms [forward]:4.20ms
|
||||||
|
5709:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.43ms [prepare input and forward]:9.88ms [forward]:4.20ms
|
||||||
|
5711:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.89ms [prepare input and forward]:10.49ms [forward]:4.19ms
|
||||||
|
5715:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.14ms [prepare input and forward]:11.21ms [forward]:4.18ms
|
||||||
|
5719:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.71ms [prepare input and forward]:10.15ms [forward]:4.42ms
|
||||||
|
5723:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.31ms [forward]:4.25ms
|
||||||
|
5725:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.12ms [prepare input and forward]:10.33ms [forward]:4.24ms
|
||||||
|
5729:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.58ms [prepare input and forward]:10.85ms [forward]:4.32ms
|
||||||
|
5733:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.32ms [prepare input and forward]:9.79ms [forward]:4.28ms
|
||||||
|
5737:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:15.06ms [prepare input and forward]:9.89ms [forward]:4.32ms
|
||||||
|
5739:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.48ms [forward]:4.27ms
|
||||||
|
5743:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.60ms [prepare input and forward]:10.71ms [forward]:4.61ms
|
||||||
|
5747:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.21ms [prepare input and forward]:10.10ms [forward]:4.52ms
|
||||||
|
5751:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:15.03ms [prepare input and forward]:10.00ms [forward]:4.42ms
|
||||||
|
|
||||||
|
```
|
||||||
198
docs/source/faqs.md
Normal file
198
docs/source/faqs.md
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
# FAQs
|
||||||
|
|
||||||
|
## Version Specific FAQs
|
||||||
|
|
||||||
|
- [[v0.9.1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2643)
|
||||||
|
- [[v0.10.1rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2630)
|
||||||
|
|
||||||
|
## General FAQs
|
||||||
|
|
||||||
|
### 1. What devices are currently supported?
|
||||||
|
|
||||||
|
Currently, **ONLY** the Atlas A2 series (Ascend-cann-kernels-910b), Atlas A3 series (Atlas-A3-cann-kernels) and Atlas 300I series (Ascend-cann-kernels-310p) are supported:
|
||||||
|
|
||||||
|
- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2)
|
||||||
|
- Atlas 800I A2 Inference series (Atlas 800I A2)
|
||||||
|
- Atlas A3 Training series (Atlas 800T A3, Atlas 900 A3 SuperPoD, Atlas 9000 A3 SuperPoD)
|
||||||
|
- Atlas 800I A3 Inference series (Atlas 800I A3)
|
||||||
|
- [Experimental] Atlas 300I Inference series (Atlas 300I Duo)
|
||||||
|
|
||||||
|
Below series are NOT supported yet:
|
||||||
|
- Atlas 200I A2 (Ascend-cann-kernels-310b) unplanned yet
|
||||||
|
- Ascend 910, Ascend 910 Pro B (Ascend-cann-kernels-910) unplanned yet
|
||||||
|
|
||||||
|
From a technical view, vllm-ascend support would be possible if torch-npu is supported. Otherwise, we have to implement it by using custom ops. You are also welcome to join us to improve it together.
|
||||||
|
|
||||||
|
### 2. How to get our docker containers?
|
||||||
|
|
||||||
|
You can get our containers at `Quay.io`, e.g., [<u>vllm-ascend</u>](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and [<u>cann</u>](https://quay.io/repository/ascend/cann?tab=tags).
|
||||||
|
|
||||||
|
If you are in China, you can use `daocloud` to accelerate your downloading:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Replace with tag you want to pull
|
||||||
|
TAG=v0.7.3rc2
|
||||||
|
docker pull m.daocloud.io/quay.io/ascend/vllm-ascend:$TAG
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Load Docker Images for offline environment
|
||||||
|
If you want to use a container image in an offline environment (no internet connection), you need to download the container image in an environment with internet access first:
|
||||||
|
|
||||||
|
**Exporting Docker images:**
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Pull the image on a machine with internet access
|
||||||
|
TAG=|vllm_ascend_version|
|
||||||
|
docker pull quay.io/ascend/vllm-ascend:$TAG
|
||||||
|
|
||||||
|
# Export the image to a tar file and compress to tar.gz
|
||||||
|
docker save quay.io/ascend/vllm-ascend:$TAG | gzip > vllm-ascend-$TAG.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
**Importing Docker images in environment without internet access:**
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Transfer the tar/tar.gz file to the offline environment and load it
|
||||||
|
TAG=|vllm_ascend_version|
|
||||||
|
docker load -i vllm-ascend-$TAG.tar.gz
|
||||||
|
|
||||||
|
# Verify the image is loaded
|
||||||
|
docker images | grep vllm-ascend
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. What models does vllm-ascend supports?
|
||||||
|
|
||||||
|
Find more details [<u>here</u>](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html).
|
||||||
|
|
||||||
|
### 4. How to get in touch with our community?
|
||||||
|
|
||||||
|
There are many channels that you can communicate with our community developers / users:
|
||||||
|
|
||||||
|
- Submit a GitHub [<u>issue</u>](https://github.com/vllm-project/vllm-ascend/issues?page=1).
|
||||||
|
- Join our [<u>weekly meeting</u>](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z) and share your ideas.
|
||||||
|
- Join our [<u>WeChat</u>](https://github.com/vllm-project/vllm-ascend/issues/227) group and ask your questions.
|
||||||
|
- Join our ascend channel in [<u>vLLM forums</u>](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support/6) and publish your topics.
|
||||||
|
|
||||||
|
### 5. What features does vllm-ascend V1 supports?
|
||||||
|
|
||||||
|
Find more details [<u>here</u>](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html).
|
||||||
|
|
||||||
|
### 6. How to solve the problem of "Failed to infer device type" or "libatb.so: cannot open shared object file"?
|
||||||
|
|
||||||
|
Basically, the reason is that the NPU environment is not configured correctly. You can:
|
||||||
|
1. try `source /usr/local/Ascend/nnal/atb/set_env.sh` to enable NNAL package.
|
||||||
|
2. try `source /usr/local/Ascend/ascend-toolkit/set_env.sh` to enable CANN package.
|
||||||
|
3. try `npu-smi info` to check whether the NPU is working.
|
||||||
|
|
||||||
|
If all above steps are not working, you can try the following code with python to check whether there is any error:
|
||||||
|
|
||||||
|
```
|
||||||
|
import torch
|
||||||
|
import torch_npu
|
||||||
|
import vllm
|
||||||
|
```
|
||||||
|
|
||||||
|
If all above steps are not working, feel free to submit a GitHub issue.
|
||||||
|
|
||||||
|
### 7. How does vllm-ascend perform?
|
||||||
|
|
||||||
|
Currently, only some models are well optimized, such as `Qwen2.5 VL`, `Qwen3` and `Deepseek V3`; others are not good enough yet. From 0.9.0rc2, Qwen and Deepseek work with graph mode to deliver good performance. What's more, you can install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference as well.
|
||||||
|
|
||||||
|
### 8. How does vllm-ascend work with vllm?
|
||||||
|
vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is the same as the version of vllm. For example, if you use vllm 0.7.3, you should use vllm-ascend 0.7.3 as well. For main branch, we will make sure `vllm-ascend` and `vllm` are compatible by each commit.
|
||||||
|
|
||||||
|
### 9. Does vllm-ascend support Prefill Disaggregation feature?
|
||||||
|
|
||||||
|
Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND support, we will make it stable and supported by vllm-ascend in the future.
|
||||||
|
|
||||||
|
### 10. Does vllm-ascend support quantization method?
|
||||||
|
|
||||||
|
Currently, w8a8 quantization is already supported natively by vllm-ascend on v0.8.4rc2 or higher. If you're using vllm 0.7.3, w8a8 quantization is supported with the integration of vllm-ascend and mindie-turbo; please use `pip install vllm-ascend[mindie-turbo]`.
|
||||||
|
|
||||||
|
### 11. How to run w8a8 DeepSeek model?
|
||||||
|
|
||||||
|
Please follow the [inferencing tutorial](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html) and replace the model with DeepSeek.
|
||||||
|
|
||||||
|
### 12. There is no output in log when loading models using vllm-ascend, How to solve it?
|
||||||
|
|
||||||
|
If you're using vllm 0.7.3 version, this is a known progress bar display issue in VLLM, which has been resolved in [this PR](https://github.com/vllm-project/vllm/pull/12428), please cherry-pick it locally by yourself. Otherwise, please fill up an issue.
|
||||||
|
|
||||||
|
### 13. How vllm-ascend is tested
|
||||||
|
|
||||||
|
vllm-ascend is tested by functional test, performance test and accuracy test.
|
||||||
|
|
||||||
|
- **Functional test**: we added CI, which includes a portion of vLLM's native unit tests and vllm-ascend's own unit tests. In vllm-ascend's tests, we verify basic functionality, popular model availability and [supported features](https://vllm-ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html) via e2e tests
|
||||||
|
|
||||||
|
- **Performance test**: we provide [benchmark](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks) tools for end-to-end performance benchmarking which can be easily reproduced locally. We'll publish a perf website to show the performance test results for each pull request
|
||||||
|
|
||||||
|
- **Accuracy test**: we're working on adding accuracy test to CI as well.
|
||||||
|
|
||||||
|
Finally, for each release, we'll publish the performance test and accuracy test report in the future.
|
||||||
|
|
||||||
|
### 14. How to fix the error "InvalidVersion" when using vllm-ascend?
|
||||||
|
It's usually because you have installed a dev/editable version of the vLLM package. In this case, we provide the env variable `VLLM_VERSION` to let users specify the version of vLLM package to use. Please set the env variable `VLLM_VERSION` to the version of vLLM package you have installed. The format of `VLLM_VERSION` should be `X.Y.Z`.
|
||||||
|
|
||||||
|
### 15. How to handle Out Of Memory?
|
||||||
|
OOM errors typically occur when the model exceeds the memory capacity of a single NPU. For general guidance, you can refer to [vLLM's OOM troubleshooting documentation](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-of-memory).
|
||||||
|
|
||||||
|
In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, dynamic memory allocation/deallocation during inference can exacerbate memory fragmentation, leading to OOM. To address this:
|
||||||
|
|
||||||
|
- **Adjust `--gpu-memory-utilization`**: If unspecified, will use the default value of `0.9`. You can decrease this param to reserve more memory to reduce fragmentation risks. See more note in: [vLLM - Inference and Serving - Engine Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-_engine_args_parser-cacheconfig).
|
||||||
|
|
||||||
|
- **Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to optimize NPU memory management. For example, you can `export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory feature to mitigate memory fragmentation caused by frequent dynamic memory size adjustments during runtime, see more note in: [PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html).
|
||||||
|
|
||||||
|
### 16. Failed to enable NPU graph mode when running DeepSeek?
|
||||||
|
You may encounter the following error if running DeepSeek with NPU graph mode enabled. The allowed number of queries per kv when enabling both MLA and Graph mode only support {32, 64, 128}, **Thus this is not supported for DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode support on DeepSeek-V2-Lite will be done in the future.
|
||||||
|
|
||||||
|
And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
[rank0]: RuntimeError: EZ9999: Inner Error!
|
||||||
|
[rank0]: EZ9999: [PID: 62938] 2025-05-27-06:52:12.455.807 numHeads / numKvHeads = 8, MLA only support {32, 64, 128}.[FUNC:CheckMlaAttrs][FILE:incre_flash_attention_tiling_check.cc][LINE:1218]
|
||||||
|
```
|
||||||
|
|
||||||
|
### 17. Failed to reinstall vllm-ascend from source after uninstalling vllm-ascend?
|
||||||
|
You may encounter the problem of C compilation failure when reinstalling vllm-ascend from source using pip. If the installation fails, it is recommended to use `python setup.py install` to install, or use `python setup.py clean` to clear the cache.
|
||||||
|
|
||||||
|
### 18. How to generate deterministic results when using vllm-ascend?
|
||||||
|
There are several factors that affect output certainty:
|
||||||
|
|
||||||
|
1. Sampler Method: using **Greedy sample** by setting `temperature=0` in `SamplingParams`, e.g.:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create a sampling params object.
|
||||||
|
sampling_params = SamplingParams(temperature=0)
|
||||||
|
# Create an LLM.
|
||||||
|
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
|
||||||
|
|
||||||
|
# Generate texts from the prompts.
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Set the following environment variables:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export LCCL_DETERMINISTIC=1
|
||||||
|
export HCCL_DETERMINISTIC=true
|
||||||
|
export ATB_MATMUL_SHUFFLE_K_ENABLE=0
|
||||||
|
export ATB_LLM_LCOC_ENABLE=0
|
||||||
|
```
|
||||||
|
|
||||||
|
### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model?
|
||||||
|
The `Qwen2.5-Omni` model requires the `librosa` package to be installed. You need to install the `qwen-omni-utils` package to ensure all dependencies are met: `pip install qwen-omni-utils`;
|
||||||
|
this package will install `librosa` and its related dependencies, resolving the `ImportError: No module named 'librosa'` issue and ensuring audio processing functionality works correctly.
|
||||||
71
docs/source/index.md
Normal file
71
docs/source/index.md
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
# Welcome to vLLM Ascend Plugin
|
||||||
|
|
||||||
|
:::{figure} ./logos/vllm-ascend-logo-text-light.png
|
||||||
|
:align: center
|
||||||
|
:alt: vLLM
|
||||||
|
:class: no-scaled-link
|
||||||
|
:width: 70%
|
||||||
|
:::
|
||||||
|
|
||||||
|
:::{raw} html
|
||||||
|
<p style="text-align:center">
|
||||||
|
<strong>vLLM Ascend Plugin
|
||||||
|
</strong>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p style="text-align:center">
|
||||||
|
<script async defer src="https://buttons.github.io/buttons.js"></script>
|
||||||
|
<a class="github-button" href="https://github.com/vllm-project/vllm-ascend" data-show-count="true" data-size="large" aria-label="Star">Star</a>
|
||||||
|
<a class="github-button" href="https://github.com/vllm-project/vllm-ascend/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
|
||||||
|
<a class="github-button" href="https://github.com/vllm-project/vllm-ascend/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
|
||||||
|
</p>
|
||||||
|
:::
|
||||||
|
|
||||||
|
vLLM Ascend plugin (vllm-ascend) is a community maintained hardware plugin for running vLLM on the Ascend NPU.
|
||||||
|
|
||||||
|
This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
|
||||||
|
|
||||||
|
By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
% How to start using vLLM on Ascend NPU?
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Getting Started
|
||||||
|
:maxdepth: 1
|
||||||
|
quick_start
|
||||||
|
installation
|
||||||
|
tutorials/index.md
|
||||||
|
faqs
|
||||||
|
:::
|
||||||
|
|
||||||
|
% What does vLLM Ascend Plugin support?
|
||||||
|
:::{toctree}
|
||||||
|
:caption: User Guide
|
||||||
|
:maxdepth: 1
|
||||||
|
user_guide/support_matrix/index
|
||||||
|
user_guide/configuration/index
|
||||||
|
user_guide/feature_guide/index
|
||||||
|
user_guide/release_notes
|
||||||
|
:::
|
||||||
|
|
||||||
|
% How to contribute to the vLLM Ascend project
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Developer Guide
|
||||||
|
:maxdepth: 1
|
||||||
|
developer_guide/contribution/index
|
||||||
|
developer_guide/feature_guide/index
|
||||||
|
developer_guide/evaluation/index
|
||||||
|
developer_guide/performance/index
|
||||||
|
developer_guide/modeling/index
|
||||||
|
:::
|
||||||
|
|
||||||
|
% How to involve vLLM Ascend
|
||||||
|
:::{toctree}
|
||||||
|
:caption: Community
|
||||||
|
:maxdepth: 1
|
||||||
|
community/governance
|
||||||
|
community/contributors
|
||||||
|
community/versioning_policy
|
||||||
|
community/user_stories/index
|
||||||
|
:::
|
||||||
282
docs/source/installation.md
Normal file
282
docs/source/installation.md
Normal file
@@ -0,0 +1,282 @@
|
|||||||
|
# Installation
|
||||||
|
|
||||||
|
This document describes how to install vllm-ascend manually.
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- OS: Linux
|
||||||
|
- Python: >= 3.9, < 3.12
|
||||||
|
- A hardware with Ascend NPU. It's usually the Atlas 800 A2 series.
|
||||||
|
- Software:
|
||||||
|
|
||||||
|
| Software | Supported version | Note |
|
||||||
|
|---------------|----------------------------------|-------------------------------------------|
|
||||||
|
| CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu |
|
||||||
|
| torch-npu | >= 2.7.1.dev20250724 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
|
||||||
|
| torch | >= 2.7.1 | Required for torch-npu and vllm |
|
||||||
|
|
||||||
|
You have two ways to install:
|
||||||
|
- **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.
|
||||||
|
- **Using docker**: use the `vllm-ascend` pre-built docker image directly.
|
||||||
|
|
||||||
|
## Configure a new environment
|
||||||
|
|
||||||
|
Before installing, you need to make sure firmware/driver and CANN are installed correctly, refer to [link](https://ascend.github.io/docs/sources/ascend/quick_install.html) for more details.
|
||||||
|
|
||||||
|
### Configure hardware environment
|
||||||
|
|
||||||
|
To verify that the Ascend NPU firmware and driver were correctly installed, run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npu-smi info
|
||||||
|
```
|
||||||
|
|
||||||
|
Refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) for more details.
|
||||||
|
|
||||||
|
### Configure software environment
|
||||||
|
|
||||||
|
:::::{tab-set}
|
||||||
|
:sync-group: install
|
||||||
|
|
||||||
|
::::{tab-item} Before using pip
|
||||||
|
:selected:
|
||||||
|
:sync: pip
|
||||||
|
|
||||||
|
The easiest way to prepare your software environment is using CANN image directly:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/cann:|cann_image_tag|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend-env \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-it $IMAGE bash
|
||||||
|
```
|
||||||
|
|
||||||
|
:::{dropdown} Click here to see "Install CANN manually"
|
||||||
|
:animate: fade-in-slide-down
|
||||||
|
You can also install CANN manually:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create a virtual environment
|
||||||
|
python -m venv vllm-ascend-env
|
||||||
|
source vllm-ascend-env/bin/activate
|
||||||
|
|
||||||
|
# Install required python packages.
|
||||||
|
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
|
||||||
|
|
||||||
|
# Download and install the CANN package.
|
||||||
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
chmod +x ./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run --full
|
||||||
|
# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-aarch64.run
|
||||||
|
|
||||||
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||||
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
chmod +x ./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run --install
|
||||||
|
|
||||||
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
chmod +x ./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run
|
||||||
|
./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run --install
|
||||||
|
|
||||||
|
source /usr/local/Ascend/nnal/atb/set_env.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Before using docker
|
||||||
|
:sync: docker
|
||||||
|
No more extra step if you are using `vllm-ascend` prebuilt docker image.
|
||||||
|
::::
|
||||||
|
:::::
|
||||||
|
|
||||||
|
Once it's done, you can start to set up `vllm` and `vllm-ascend`.
|
||||||
|
|
||||||
|
## Setup vllm and vllm-ascend
|
||||||
|
|
||||||
|
:::::{tab-set}
|
||||||
|
:sync-group: install
|
||||||
|
|
||||||
|
::::{tab-item} Using pip
|
||||||
|
:selected:
|
||||||
|
:sync: pip
|
||||||
|
|
||||||
|
First install system dependencies and config pip mirror:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Using apt-get with mirror
|
||||||
|
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
|
||||||
|
apt-get update -y && apt-get install -y gcc g++ cmake libnuma-dev wget git curl jq
|
||||||
|
# Or using yum
|
||||||
|
# yum update -y && yum install -y gcc g++ cmake numactl-devel wget git curl jq
|
||||||
|
# Config pip mirror
|
||||||
|
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
|
||||||
|
```
|
||||||
|
|
||||||
|
**[Optional]** Then config the extra-index of `pip` if you are working on a x86 machine or using torch-npu dev version:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# For torch-npu dev version or x86 machine
|
||||||
|
pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/ https://mirrors.huaweicloud.com/ascend/repos/pypi"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
# Install vllm-project/vllm from pypi
|
||||||
|
pip install vllm==|pip_vllm_version|
|
||||||
|
|
||||||
|
# Install vllm-project/vllm-ascend from pypi.
|
||||||
|
pip install vllm-ascend==|pip_vllm_ascend_version|
|
||||||
|
```
|
||||||
|
|
||||||
|
:::{dropdown} Click here to see "Build from source code"
|
||||||
|
or build from **source code**:
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
# Install vLLM
|
||||||
|
git clone --depth 1 --branch |vllm_version| https://github.com/vllm-project/vllm
|
||||||
|
cd vllm
|
||||||
|
VLLM_TARGET_DEVICE=empty pip install -v -e .
|
||||||
|
cd ..
|
||||||
|
|
||||||
|
# Install vLLM Ascend
|
||||||
|
git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
|
||||||
|
cd vllm-ascend
|
||||||
|
pip install -v -e .
|
||||||
|
cd ..
|
||||||
|
```
|
||||||
|
|
||||||
|
vllm-ascend will build custom ops by default. If you don't want to build it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it.
|
||||||
|
:::
|
||||||
|
|
||||||
|
```{note}
|
||||||
|
If you are building from v0.7.3-dev and intend to use sleep mode feature, you should set `COMPILE_CUSTOM_KERNELS=1` manually.
|
||||||
|
To build custom ops, gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using `pip install -e .` and encounter a torch-npu version conflict, please install with `pip install --no-build-isolation -e .` to build on system env.
|
||||||
|
If you encounter other problems during compiling, it is probably because unexpected compiler is being used, you may export `CXX_COMPILER` and `C_COMPILER` in env to specify your g++ and gcc locations before compiling.
|
||||||
|
```
|
||||||
|
|
||||||
|
::::
|
||||||
|
|
||||||
|
::::{tab-item} Using docker
|
||||||
|
:sync: docker
|
||||||
|
|
||||||
|
You can just pull the **prebuilt image** and run it with bash.
|
||||||
|
|
||||||
|
:::{dropdown} Click here to see "Build from Dockerfile"
|
||||||
|
or build IMAGE from **source code**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/vllm-project/vllm-ascend.git
|
||||||
|
cd vllm-ascend
|
||||||
|
docker build -t vllm-ascend-dev-image:latest -f ./Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
|
```{code-block} bash
|
||||||
|
:substitutions:
|
||||||
|
|
||||||
|
# Update DEVICE according to your device (/dev/davinci[0-7])
|
||||||
|
export DEVICE=/dev/davinci7
|
||||||
|
# Update the vllm-ascend image
|
||||||
|
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||||
|
docker run --rm \
|
||||||
|
--name vllm-ascend-env \
|
||||||
|
--device $DEVICE \
|
||||||
|
--device /dev/davinci_manager \
|
||||||
|
--device /dev/devmm_svm \
|
||||||
|
--device /dev/hisi_hdc \
|
||||||
|
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||||
|
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||||
|
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||||
|
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||||
|
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||||
|
-v /root/.cache:/root/.cache \
|
||||||
|
-it $IMAGE bash
|
||||||
|
```
|
||||||
|
|
||||||
|
The default workdir is `/workspace`. vLLM and vLLM Ascend code are placed in `/vllm-workspace` and installed in [development mode](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) (`pip install -e`) to let developers apply changes immediately without requiring a new installation.
|
||||||
|
::::
|
||||||
|
|
||||||
|
:::::
|
||||||
|
|
||||||
|
## Extra information
|
||||||
|
|
||||||
|
### Verify installation
|
||||||
|
|
||||||
|
Create and run a simple inference test. The `example.py` can be like:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create a sampling params object.
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
# Create an LLM.
|
||||||
|
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
|
||||||
|
|
||||||
|
# Generate texts from the prompts.
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Try `export VLLM_USE_MODELSCOPE=true` and `pip install modelscope`
|
||||||
|
# to speed up download if huggingface is not reachable.
|
||||||
|
python example.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The output will be like:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
INFO 02-18 08:49:58 __init__.py:28] Available plugins for group vllm.platform_plugins:
|
||||||
|
INFO 02-18 08:49:58 __init__.py:30] name=ascend, value=vllm_ascend:register
|
||||||
|
INFO 02-18 08:49:58 __init__.py:32] all available plugins for group vllm.platform_plugins will be loaded.
|
||||||
|
INFO 02-18 08:49:58 __init__.py:34] set environment variable VLLM_PLUGINS to control which plugins to load.
|
||||||
|
INFO 02-18 08:49:58 __init__.py:42] plugin ascend loaded.
|
||||||
|
INFO 02-18 08:49:58 __init__.py:174] Platform plugin ascend is activated
|
||||||
|
INFO 02-18 08:50:12 config.py:526] This model supports multiple tasks: {'embed', 'classify', 'generate', 'score', 'reward'}. Defaulting to 'generate'.
|
||||||
|
INFO 02-18 08:50:12 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='./Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='./Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=npu, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=./Qwen2.5-0.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False,
|
||||||
|
Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
|
||||||
|
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 5.86it/s]
|
||||||
|
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 5.85it/s]
|
||||||
|
INFO 02-18 08:50:24 executor_base.py:108] # CPU blocks: 35064, # CPU blocks: 2730
|
||||||
|
INFO 02-18 08:50:24 executor_base.py:113] Maximum concurrency for 32768 tokens per request: 136.97x
|
||||||
|
INFO 02-18 08:50:25 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 3.87 seconds
|
||||||
|
Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 8.46it/s, est. speed input: 46.55 toks/s, output: 135.41 toks/s]
|
||||||
|
Prompt: 'Hello, my name is', Generated text: " Shinji, a teenage boy from New York City. I'm a computer science"
|
||||||
|
Prompt: 'The president of the United States is', Generated text: ' a very important person. When he or she is elected, many people think that'
|
||||||
|
Prompt: 'The capital of France is', Generated text: ' Paris. The oldest part of the city is Saint-Germain-des-Pr'
|
||||||
|
Prompt: 'The future of AI is', Generated text: ' not bright\n\nThere is no doubt that the evolution of AI will have a huge'
|
||||||
|
```
|
||||||
1647
docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po
Normal file
1647
docs/source/locale/zh_CN/LC_MESSAGES/community/contributors.po
Normal file
File diff suppressed because it is too large
Load Diff
204
docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po
Normal file
204
docs/source/locale/zh_CN/LC_MESSAGES/community/governance.po
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:1
|
||||||
|
msgid "Governance"
|
||||||
|
msgstr "治理"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:3
|
||||||
|
msgid "Mission"
|
||||||
|
msgstr "使命"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:4
|
||||||
|
msgid ""
|
||||||
|
"As a vital component of vLLM, the vLLM Ascend project is dedicated to "
|
||||||
|
"providing an easy, fast, and cheap LLM Serving for Everyone on Ascend NPU, "
|
||||||
|
"and to actively contribute to the enrichment of vLLM."
|
||||||
|
msgstr ""
|
||||||
|
"作为 vLLM 的重要组成部分,vLLM Ascend 项目致力于为所有人在 Ascend NPU 上提供简单、快速且低成本的大语言模型服务,并积极促进"
|
||||||
|
" vLLM 的丰富发展。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:6
|
||||||
|
msgid "Principles"
|
||||||
|
msgstr "原则"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:7
|
||||||
|
msgid ""
|
||||||
|
"vLLM Ascend follows the vLLM community's code of conduct:[vLLM - CODE OF "
|
||||||
|
"CONDUCT](https://github.com/vllm-project/vllm/blob/main/CODE_OF_CONDUCT.md)"
|
||||||
|
msgstr ""
|
||||||
|
"vLLM Ascend 遵循 vLLM 社区的行为准则:[vLLM - 行为准则](https://github.com/vllm-"
|
||||||
|
"project/vllm/blob/main/CODE_OF_CONDUCT.md)"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:9
|
||||||
|
msgid "Governance - Mechanics"
|
||||||
|
msgstr "治理 - 机制"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:10
|
||||||
|
msgid ""
|
||||||
|
"vLLM Ascend is an open-source project under the vLLM community, where the "
|
||||||
|
"authority to appoint roles is ultimately determined by the vLLM community. "
|
||||||
|
"It adopts a hierarchical technical governance structure."
|
||||||
|
msgstr "vLLM Ascend 是 vLLM 社区下的一个开源项目,其角色任命权最终由 vLLM 社区决定。它采用分层的技术治理结构。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:12
|
||||||
|
msgid "Contributor:"
|
||||||
|
msgstr "贡献者:"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:14
|
||||||
|
msgid ""
|
||||||
|
"**Responsibility:** Help new contributors on boarding, handle and respond to"
|
||||||
|
" community questions, review RFCs, code"
|
||||||
|
msgstr "**职责:** 帮助新贡献者加入,处理和回复社区问题,审查RFC和代码"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:16
|
||||||
|
msgid ""
|
||||||
|
"**Requirements:** Complete at least 1 contribution. Contributor is someone "
|
||||||
|
"who consistently and actively participates in a project, included but not "
|
||||||
|
"limited to issue/review/commits/community involvement."
|
||||||
|
msgstr "**要求:** 完成至少1次贡献。贡献者是指持续且积极参与项目的人,包括但不限于问题、评审、提交和社区参与。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:18
|
||||||
|
msgid ""
|
||||||
|
"Contributors will be empowered [vllm-project/vllm-"
|
||||||
|
"ascend](https://github.com/vllm-project/vllm-ascend) Github repo `Triage` "
|
||||||
|
"permissions (`Can read and clone this repository. Can also manage issues and"
|
||||||
|
" pull requests`) to help community developers collaborate more efficiently."
|
||||||
|
msgstr ""
|
||||||
|
"贡献者将被赋予 [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend) Github 仓库的 `Triage` 权限(`可读取和克隆此仓库。还可以管理问题和拉取请求`),以帮助社区开发者更加高效地协作。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:20
|
||||||
|
msgid "Maintainer:"
|
||||||
|
msgstr "维护者:"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:22
|
||||||
|
msgid ""
|
||||||
|
"**Responsibility:** Develop the project's vision and mission. Maintainers "
|
||||||
|
"are responsible for driving the technical direction of the entire project "
|
||||||
|
"and ensuring its overall success, possessing code merge permissions. They "
|
||||||
|
"formulate the roadmap, review contributions from community members, "
|
||||||
|
"continuously contribute code, and actively engage in community activities "
|
||||||
|
"(such as regular meetings/events)."
|
||||||
|
msgstr ""
|
||||||
|
"**责任:** "
|
||||||
|
"制定项目的愿景和使命。维护者负责引领整个项目的技术方向并确保其整体成功,拥有代码合并权限。他们制定路线图,审核社区成员的贡献,持续贡献代码,并积极参与社区活动(如定期会议/活动)。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:24
|
||||||
|
msgid ""
|
||||||
|
"**Requirements:** Deep understanding of vLLM and vLLM Ascend codebases, "
|
||||||
|
"with a commitment to sustained code contributions. Competency in "
|
||||||
|
"design/development/PR review workflows."
|
||||||
|
msgstr ""
|
||||||
|
"**要求:** 深入理解 vLLM 和 vLLM Ascend 代码库,并承诺持续贡献代码。具备 设计/开发/PR 审核流程 的能力。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:25
|
||||||
|
msgid ""
|
||||||
|
"**Review Quality:** Actively participate in community code reviews, "
|
||||||
|
"ensuring high-quality code integration."
|
||||||
|
msgstr "**评审质量:** 积极参与社区代码评审,确保高质量的代码集成。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:26
|
||||||
|
msgid ""
|
||||||
|
"**Quality Contribution:** Successfully develop and deliver at least one "
|
||||||
|
"major feature while maintaining consistent high-quality contributions."
|
||||||
|
msgstr "**质量贡献:** 成功开发并交付至少一个主要功能,同时持续保持高质量的贡献。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:27
|
||||||
|
msgid ""
|
||||||
|
"**Community Involvement:** Actively address issues, respond to forum "
|
||||||
|
"inquiries, participate in discussions, and engage in community-driven tasks."
|
||||||
|
msgstr "**社区参与:** 积极解决问题,回复论坛询问,参与讨论,并参与社区驱动的任务。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:29
|
||||||
|
msgid ""
|
||||||
|
"Requires approval from existing Maintainers. The vLLM community has the "
|
||||||
|
"final decision-making authority."
|
||||||
|
msgstr "需要现有维护者的批准。vLLM社区拥有最终决策权。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:31
|
||||||
|
msgid ""
|
||||||
|
"Maintainer will be empowered [vllm-project/vllm-"
|
||||||
|
"ascend](https://github.com/vllm-project/vllm-ascend) Github repo write "
|
||||||
|
"permissions (`Can read, clone, and push to this repository. Can also manage "
|
||||||
|
"issues and pull requests`)."
|
||||||
|
msgstr ""
|
||||||
|
"维护者将被授予 [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend) Github 仓库的写入权限(`可以读取、克隆和推送到此仓库。还可以管理问题和拉取请求`)。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:33
|
||||||
|
msgid "Nominating and Removing Maintainers"
|
||||||
|
msgstr "提名和移除维护者"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:35
|
||||||
|
msgid "The Principles"
|
||||||
|
msgstr "原则"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:37
|
||||||
|
msgid ""
|
||||||
|
"Membership in vLLM Ascend is given to individuals on merit basis after they "
|
||||||
|
"demonstrated strong expertise of the vLLM / vLLM Ascend through "
|
||||||
|
"contributions, reviews and discussions."
|
||||||
|
msgstr ""
|
||||||
|
"vLLM Ascend 的成员资格是基于个人能力授予的,只有在通过贡献、评审和讨论展示出对 vLLM / vLLM Ascend "
|
||||||
|
"的深厚专业知识后,才可获得。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:39
|
||||||
|
msgid ""
|
||||||
|
"For membership in the maintainer group the individual has to demonstrate "
|
||||||
|
"strong and continued alignment with the overall vLLM / vLLM Ascend "
|
||||||
|
"principles."
|
||||||
|
msgstr "要成为维护者组成员,个人必须表现出与 vLLM / vLLM Ascend 总体原则的高度一致并持续支持。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:41
|
||||||
|
msgid ""
|
||||||
|
"Light criteria of moving module maintenance to ‘emeritus’ status if they "
|
||||||
|
"don’t actively participate over long periods of time."
|
||||||
|
msgstr "如果模块维护人员在长时间内没有积极参与,可根据较宽松的标准将其维护状态转为“荣誉”状态。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:43
|
||||||
|
msgid "The membership is for an individual, not a company."
|
||||||
|
msgstr "该会员资格属于个人,而非公司。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:45
|
||||||
|
msgid "Nomination and Removal"
|
||||||
|
msgstr "提名与罢免"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:47
|
||||||
|
msgid ""
|
||||||
|
"Nomination: Anyone can nominate someone to become a maintainer (include "
|
||||||
|
"self-nominate). All existing maintainers are responsible for evaluating the "
|
||||||
|
"nomination. The nominator should provide nominee's info around the strength "
|
||||||
|
"of the candidate to be a maintainer, include but not limited to review "
|
||||||
|
"quality, quality contribution, community involvement."
|
||||||
|
msgstr ""
|
||||||
|
"提名:任何人都可以提名他人成为维护者(包括自荐)。所有现有维护者都有责任评估提名。提名人应提供被提名人成为维护者的相关优势信息,包括但不限于评审质量、优质贡献、社区参与等。"
|
||||||
|
|
||||||
|
#: ../../community/governance.md:48
|
||||||
|
msgid ""
|
||||||
|
"Removal: Anyone can nominate a person to be removed from maintainer position"
|
||||||
|
" (include self-nominate). All existing maintainers are responsible for "
|
||||||
|
"evaluating the nomination. The nominator should provide nominee's info, "
|
||||||
|
"include but not limited to lack of activity, conflict with the overall "
|
||||||
|
"direction and other information that makes them unfit to be a maintainer."
|
||||||
|
msgstr ""
|
||||||
|
"移除:任何人都可以提名某人被移出维护者职位(包括自荐)。所有现有维护者都有责任评估该提名。提名者应提供被提名人的相关信息,包括但不限于缺乏活动、与整体方向冲突以及使其不适合作为维护者的其他信息。"
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:15
|
||||||
|
msgid "More details"
|
||||||
|
msgstr "更多细节"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:1
|
||||||
|
msgid "User Stories"
|
||||||
|
msgstr "用户故事"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:3
|
||||||
|
msgid ""
|
||||||
|
"Read case studies on how users and developers solves real, everyday problems"
|
||||||
|
" with vLLM Ascend"
|
||||||
|
msgstr "阅读案例研究,了解用户和开发者如何使用 vLLM Ascend 解决实际日常问题。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:5
|
||||||
|
msgid ""
|
||||||
|
"[LLaMA-Factory](./llamafactory.md) is an easy-to-use and efficient platform "
|
||||||
|
"for training and fine-tuning large language models, it supports vLLM Ascend "
|
||||||
|
"to speed up inference since [LLaMA-"
|
||||||
|
"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739), gain 2x "
|
||||||
|
"performance enhancement of inference."
|
||||||
|
msgstr ""
|
||||||
|
"[LLaMA-Factory](./llamafactory.md) 是一个易于使用且高效的大语言模型训练与微调平台,自 [LLaMA-"
|
||||||
|
"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739) 起支持 vLLM "
|
||||||
|
"Ascend 加速推理,推理性能提升 2 倍。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:7
|
||||||
|
msgid ""
|
||||||
|
"[Huggingface/trl](https://github.com/huggingface/trl) is a cutting-edge "
|
||||||
|
"library designed for post-training foundation models using advanced "
|
||||||
|
"techniques like SFT, PPO and DPO, it uses vLLM Ascend since "
|
||||||
|
"[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) to "
|
||||||
|
"support RLHF on Ascend NPU."
|
||||||
|
msgstr ""
|
||||||
|
"[Huggingface/trl](https://github.com/huggingface/trl) 是一个前沿的库,专为使用 SFT、PPO 和"
|
||||||
|
" DPO 等先进技术对基础模型进行后训练而设计。从 "
|
||||||
|
"[v0.17.0](https://github.com/huggingface/trl/releases/tag/v0.17.0) 版本开始,该库利用"
|
||||||
|
" vLLM Ascend 来支持在 Ascend NPU 上进行 RLHF。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:9
|
||||||
|
msgid ""
|
||||||
|
"[MindIE Turbo](https://pypi.org/project/mindie-turbo) is an LLM inference "
|
||||||
|
"engine acceleration plug-in library developed by Huawei on Ascend hardware, "
|
||||||
|
"which includes self-developed large language model optimization algorithms "
|
||||||
|
"and optimizations related to the inference engine framework. It supports "
|
||||||
|
"vLLM Ascend since "
|
||||||
|
"[2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-"
|
||||||
|
"turbo-0001.html)."
|
||||||
|
msgstr ""
|
||||||
|
"[MindIE Turbo](https://pypi.org/project/mindie-turbo) "
|
||||||
|
"是华为在昇腾硬件上开发的一款用于加速LLM推理引擎的插件库,包含自主研发的大语言模型优化算法及与推理引擎框架相关的优化。从 "
|
||||||
|
"[2.0rc1](https://www.hiascend.com/document/detail/zh/mindie/20RC1/AcceleratePlugin/turbodev/mindie-"
|
||||||
|
"turbo-0001.html) 起,支持 vLLM Ascend。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:11
|
||||||
|
msgid ""
|
||||||
|
"[GPUStack](https://github.com/gpustack/gpustack) is an open-source GPU "
|
||||||
|
"cluster manager for running AI models. It supports vLLM Ascend since "
|
||||||
|
"[v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2), see more"
|
||||||
|
" GPUStack performance evaluation info on "
|
||||||
|
"[link](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)."
|
||||||
|
msgstr ""
|
||||||
|
"[GPUStack](https://github.com/gpustack/gpustack) 是一个开源的 GPU 集群管理器,用于运行 AI "
|
||||||
|
"模型。从 [v0.6.2](https://github.com/gpustack/gpustack/releases/tag/v0.6.2) "
|
||||||
|
"版本开始支持 vLLM Ascend,更多 GPUStack 性能评测信息见 "
|
||||||
|
"[链接](https://mp.weixin.qq.com/s/pkytJVjcH9_OnffnsFGaew)。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/index.md:13
|
||||||
|
msgid ""
|
||||||
|
"[verl](https://github.com/volcengine/verl) is a flexible, efficient and "
|
||||||
|
"production-ready RL training library for large language models (LLMs), uses "
|
||||||
|
"vLLM Ascend since "
|
||||||
|
"[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0), see more "
|
||||||
|
"info on [verl x Ascend "
|
||||||
|
"Quickstart](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html)."
|
||||||
|
msgstr ""
|
||||||
|
"[verl](https://github.com/volcengine/verl) "
|
||||||
|
"是一个灵活、高效且可用于生产环境的大型语言模型(LLM)强化学习训练库,自 "
|
||||||
|
"[v0.4.0](https://github.com/volcengine/verl/releases/tag/v0.4.0) 起支持 vLLM "
|
||||||
|
"Ascend,更多信息请参见 [verl x Ascend "
|
||||||
|
"快速上手](https://verl.readthedocs.io/en/latest/ascend_tutorial/ascend_quick_start.html)。"
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:1
|
||||||
|
msgid "LLaMA-Factory"
|
||||||
|
msgstr "LLaMA-Factory"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:3
|
||||||
|
msgid "**About / Introduction**"
|
||||||
|
msgstr "**关于 / 介绍**"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:5
|
||||||
|
msgid ""
|
||||||
|
"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) is an easy-to-use "
|
||||||
|
"and efficient platform for training and fine-tuning large language models. "
|
||||||
|
"With LLaMA-Factory, you can fine-tune hundreds of pre-trained models locally"
|
||||||
|
" without writing any code."
|
||||||
|
msgstr ""
|
||||||
|
"[LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory) "
|
||||||
|
"是一个易于使用且高效的平台,用于训练和微调大型语言模型。有了 LLaMA-Factory,你可以在本地对数百个预训练模型进行微调,无需编写任何代码。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:7
|
||||||
|
msgid ""
|
||||||
|
"LLaMA-Facotory users need to evaluate and inference the model after fine-"
|
||||||
|
"tuning the model."
|
||||||
|
msgstr "LLaMA-Factory 用户需要在对模型进行微调后对模型进行评估和推理。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:9
|
||||||
|
msgid "**The Business Challenge**"
|
||||||
|
msgstr "**业务挑战**"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:11
|
||||||
|
msgid ""
|
||||||
|
"LLaMA-Factory used transformers to perform inference on Ascend NPU, but the "
|
||||||
|
"speed was slow."
|
||||||
|
msgstr "LLaMA-Factory 使用 transformers 在 Ascend NPU 上进行推理,但速度较慢。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:13
|
||||||
|
msgid "**Solving Challenges and Benefits with vLLM Ascend**"
|
||||||
|
msgstr "**通过 vLLM Ascend 解决挑战与收益**"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:15
|
||||||
|
msgid ""
|
||||||
|
"With the joint efforts of LLaMA-Factory and vLLM Ascend ([LLaMA-"
|
||||||
|
"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)), the "
|
||||||
|
"performance of LLaMA-Factory in the model inference stage has been "
|
||||||
|
"significantly improved. According to the test results, the inference speed "
|
||||||
|
"of LLaMA-Factory has been increased to 2x compared to the transformers "
|
||||||
|
"version."
|
||||||
|
msgstr ""
|
||||||
|
"在 LLaMA-Factory 和 vLLM Ascend 的共同努力下(参见 [LLaMA-"
|
||||||
|
"Factory#7739](https://github.com/hiyouga/LLaMA-Factory/pull/7739)),LLaMA-"
|
||||||
|
"Factory 在模型推理阶段的性能得到了显著提升。根据测试结果,LLaMA-Factory 的推理速度相比 transformers 版本提升到了 2"
|
||||||
|
" 倍。"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:17
|
||||||
|
msgid "**Learn more**"
|
||||||
|
msgstr "**了解更多**"
|
||||||
|
|
||||||
|
#: ../../community/user_stories/llamafactory.md:19
|
||||||
|
msgid ""
|
||||||
|
"See more about LLaMA-Factory and how it uses vLLM Ascend for inference on "
|
||||||
|
"the Ascend NPU in the following documentation: [LLaMA-Factory Ascend NPU "
|
||||||
|
"Inference](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)."
|
||||||
|
msgstr ""
|
||||||
|
"在以下文档中查看更多关于 LLaMA-Factory 以及其如何在 Ascend NPU 上使用 vLLM Ascend 进行推理的信息:[LLaMA-"
|
||||||
|
"Factory Ascend NPU "
|
||||||
|
"推理](https://llamafactory.readthedocs.io/en/latest/advanced/npu_inference.html)。"
|
||||||
@@ -0,0 +1,624 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:1
|
||||||
|
msgid "Versioning policy"
|
||||||
|
msgstr "版本管理策略"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:3
|
||||||
|
msgid ""
|
||||||
|
"Starting with vLLM 0.7.x, the vLLM Ascend Plugin ([vllm-project/vllm-"
|
||||||
|
"ascend](https://github.com/vllm-project/vllm-ascend)) project follows the "
|
||||||
|
"[PEP 440](https://peps.python.org/pep-0440/) to publish matching with vLLM "
|
||||||
|
"([vllm-project/vllm](https://github.com/vllm-project/vllm))."
|
||||||
|
msgstr ""
|
||||||
|
"从 vLLM 0.7.x 开始,vLLM Ascend 插件([vllm-project/vllm-"
|
||||||
|
"ascend](https://github.com/vllm-project/vllm-ascend))项目遵循 [PEP "
|
||||||
|
"440](https://peps.python.org/pep-0440/) ,以与 vLLM([vllm-"
|
||||||
|
"project/vllm](https://github.com/vllm-project/vllm))版本匹配发布。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:5
|
||||||
|
msgid "vLLM Ascend Plugin versions"
|
||||||
|
msgstr "vLLM Ascend 插件版本"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:7
|
||||||
|
msgid ""
|
||||||
|
"Each vLLM Ascend release will be versioned: "
|
||||||
|
"`v[major].[minor].[micro][rcN][.postN]` (such as `v0.7.3rc1`, `v0.7.3`, "
|
||||||
|
"`v0.7.3.post1`)"
|
||||||
|
msgstr ""
|
||||||
|
"每个 vLLM Ascend 版本将采用以下版本格式:`v[major].[minor].[micro][rcN][.postN]`(例如 "
|
||||||
|
"`v0.7.3rc1`、`v0.7.3`、`v0.7.3.post1`)"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:10
|
||||||
|
msgid ""
|
||||||
|
"**Final releases**: will typically be released every **3 months**, will take"
|
||||||
|
" the vLLM upstream release plan and Ascend software product release plan "
|
||||||
|
"into comprehensive consideration."
|
||||||
|
msgstr "**正式版本**:通常每**3个月**发布一次,将综合考虑 vLLM 上游发行计划和昇腾软件产品发行计划。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:11
|
||||||
|
msgid ""
|
||||||
|
"**Pre releases**: will typically be released **on demand**, ending with rcN,"
|
||||||
|
" represents the Nth release candidate version, to support early testing by "
|
||||||
|
"our users prior to a final release."
|
||||||
|
msgstr "**预发布版本**:通常会**按需发布**,以 rcN 结尾,表示第N个候选发布版本,旨在支持用户在正式发布前进行早期测试。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:12
|
||||||
|
msgid ""
|
||||||
|
"**Post releases**: will typically be released **on demand** to support to "
|
||||||
|
"address minor errors in a final release. It's different from [PEP-440 post "
|
||||||
|
"release note](https://peps.python.org/pep-0440/#post-releases) suggestion, "
|
||||||
|
"it will contain actual bug fixes considering that the final release version "
|
||||||
|
"should be matched strictly with the vLLM final release version "
|
||||||
|
"(`v[major].[minor].[micro]`). The post version has to be published as a "
|
||||||
|
"patch version of the final release."
|
||||||
|
msgstr ""
|
||||||
|
"**后续版本**:通常会根据需要发布,以支持解决正式发布中的小错误。这与 [PEP-440 "
|
||||||
|
"的后续版本说明](https://peps.python.org/pep-0440/#post-releases) 建议不同,它将包含实际的 bug "
|
||||||
|
"修复,因为最终发布版本应严格与 vLLM "
|
||||||
|
"的最终发布版本(`v[major].[minor].[micro]`)匹配。后续版本必须以正式发布的补丁版本形式发布。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:14
|
||||||
|
msgid "For example:"
|
||||||
|
msgstr "例如:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:15
|
||||||
|
msgid ""
|
||||||
|
"`v0.7.x`: it's the first final release to match the vLLM `v0.7.x` version."
|
||||||
|
msgstr "`v0.7.x`:这是第一个与 vLLM `v0.7.x` 版本相匹配的正式发布版本。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:16
|
||||||
|
msgid "`v0.7.3rc1`: will be the first pre version of vLLM Ascend."
|
||||||
|
msgstr "`v0.7.3rc1`:将会是 vLLM Ascend 的第一个预发布版本。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:17
|
||||||
|
msgid ""
|
||||||
|
"`v0.7.3.post1`: will be the post release if the `v0.7.3` release has some "
|
||||||
|
"minor errors."
|
||||||
|
msgstr "`v0.7.3.post1`:如果 `v0.7.3` 版本发布有一些小错误,将作为后续修正版发布。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:19
|
||||||
|
msgid "Release Compatibility Matrix"
|
||||||
|
msgstr "版本兼容性矩阵"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:21
|
||||||
|
msgid "Following is the Release Compatibility Matrix for vLLM Ascend Plugin:"
|
||||||
|
msgstr "以下是 vLLM Ascend 插件的版本兼容性矩阵:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "vLLM Ascend"
|
||||||
|
msgstr "vLLM Ascend"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "vLLM"
|
||||||
|
msgstr "vLLM"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Python"
|
||||||
|
msgstr "Python"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Stable CANN"
|
||||||
|
msgstr "Stable CANN"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "PyTorch/torch_npu"
|
||||||
|
msgstr "PyTorch/torch_npu"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "MindIE Turbo"
|
||||||
|
msgstr "MindIE Turbo"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.2rc1"
|
||||||
|
msgstr "v0.9.2rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.2"
|
||||||
|
msgstr "v0.9.2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid ">= 3.9, < 3.12"
|
||||||
|
msgstr ">= 3.9,< 3.12"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "8.1.RC1"
|
||||||
|
msgstr "8.1.RC1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2.5.1 / 2.5.1.post1.dev20250619"
|
||||||
|
msgstr "2.5.1 / 2.5.1.post1.dev20250619"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.1rc1"
|
||||||
|
msgstr "v0.9.1rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.1"
|
||||||
|
msgstr "v0.9.1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2.5.1 / 2.5.1.post1.dev20250528"
|
||||||
|
msgstr "2.5.1 / 2.5.1.post1.dev20250528"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.0rc2"
|
||||||
|
msgstr "v0.9.0rc2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.0"
|
||||||
|
msgstr "v0.9.0"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2.5.1 / 2.5.1"
|
||||||
|
msgstr "2.5.1 / 2.5.1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.0rc1"
|
||||||
|
msgstr "v0.9.0rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.8.5rc1"
|
||||||
|
msgstr "v0.8.5rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.8.5.post1"
|
||||||
|
msgstr "v0.8.5.post1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.8.4rc2"
|
||||||
|
msgstr "v0.8.4rc2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.8.4"
|
||||||
|
msgstr "v0.8.4"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "8.0.0"
|
||||||
|
msgstr "8.0.0"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.3.post1"
|
||||||
|
msgstr "v0.7.3.post1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.3"
|
||||||
|
msgstr "v0.7.3"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2.0rc1"
|
||||||
|
msgstr "2.0rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:34
|
||||||
|
msgid "Release cadence"
|
||||||
|
msgstr "发布节奏"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:36
|
||||||
|
msgid "release window"
|
||||||
|
msgstr "发布窗口"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Date"
|
||||||
|
msgstr "日期"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Event"
|
||||||
|
msgstr "事件"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.07.11"
|
||||||
|
msgstr "2025.07.11"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.9.2rc1"
|
||||||
|
msgstr "候选发布版本,v0.9.2rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.06.22"
|
||||||
|
msgstr "2025.06.22"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.9.1rc1"
|
||||||
|
msgstr "候选发布版本,v0.9.1rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.06.10"
|
||||||
|
msgstr "2025.06.10"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.9.0rc2"
|
||||||
|
msgstr "候选发布版本,v0.9.0rc2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.06.09"
|
||||||
|
msgstr "2025.06.09"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.9.0rc1"
|
||||||
|
msgstr "候选发布版本,v0.9.0rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.05.29"
|
||||||
|
msgstr "2025.05.29"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.x post release, v0.7.3.post1"
|
||||||
|
msgstr "v0.7.x 补丁版,v0.7.3.post1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.05.08"
|
||||||
|
msgstr "2025.05.08"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.x Final release, v0.7.3"
|
||||||
|
msgstr "v0.7.x 正式版,v0.7.3"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.05.06"
|
||||||
|
msgstr "2025.05.06"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.8.5rc1"
|
||||||
|
msgstr "候选发布版本,v0.8.5rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.04.28"
|
||||||
|
msgstr "2025.04.28"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.8.4rc2"
|
||||||
|
msgstr "候选发布版本,v0.8.4rc2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.04.18"
|
||||||
|
msgstr "2025.04.18"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.8.4rc1"
|
||||||
|
msgstr "候选发布版本,v0.8.4rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.03.28"
|
||||||
|
msgstr "2025.03.28"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.7.3rc2"
|
||||||
|
msgstr "候选发布版本,v0.7.3rc2"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.03.14"
|
||||||
|
msgstr "2025.03.14"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.7.3rc1"
|
||||||
|
msgstr "候选发布版本,v0.7.3rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "2025.02.19"
|
||||||
|
msgstr "2025.02.19"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Release candidates, v0.7.1rc1"
|
||||||
|
msgstr "候选发布版本,v0.7.1rc1"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:53
|
||||||
|
msgid "Branch policy"
|
||||||
|
msgstr "分支策略"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:55
|
||||||
|
msgid "vLLM Ascend has main branch and dev branch."
|
||||||
|
msgstr "vLLM Ascend 有主分支和开发分支。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:57
|
||||||
|
msgid ""
|
||||||
|
"**main**: main branch,corresponds to the vLLM main branch and latest 1 or 2 "
|
||||||
|
"release version. It is continuously monitored for quality through Ascend CI."
|
||||||
|
msgstr "**main**:main 分支,对应 vLLM 的主分支和最新的 1 或 2 个发布版本。该分支通过 Ascend CI 持续监控质量。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:58
|
||||||
|
msgid ""
|
||||||
|
"**vX.Y.Z-dev**: development branch, created with part of new releases of "
|
||||||
|
"vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version."
|
||||||
|
msgstr ""
|
||||||
|
"**vX.Y.Z-dev**:开发分支,是随着 vLLM 新版本的一部分一起创建的。例如,`v0.7.3-dev` 是 vLLM `v0.7.3` "
|
||||||
|
"版本的开发分支。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:60
|
||||||
|
msgid ""
|
||||||
|
"Usually, a commit should be ONLY first merged in the main branch, and then "
|
||||||
|
"backported to the dev branch to reduce maintenance costs as much as "
|
||||||
|
"possible."
|
||||||
|
msgstr "通常,提交应该只先合并到主分支,然后再回溯合并到开发分支,以尽可能降低维护成本。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:62
|
||||||
|
msgid "Maintenance branch and EOL:"
|
||||||
|
msgstr "维护分支与生命周期结束(EOL):"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:63
|
||||||
|
msgid "The branch status will be in one of the following states:"
|
||||||
|
msgstr "分支状态将处于以下几种状态之一:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Branch"
|
||||||
|
msgstr "分支"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Time frame"
|
||||||
|
msgstr "时间范围"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Summary"
|
||||||
|
msgstr "摘要"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Maintained"
|
||||||
|
msgstr "维护中"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Approximately 2-3 minor versions"
|
||||||
|
msgstr "大约 2-3 个小版本"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "All bugfixes are appropriate. Releases produced, CI commitment."
|
||||||
|
msgstr "所有的错误修复都是合适的。正常发布版本,持续集成承诺。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Unmaintained"
|
||||||
|
msgstr "无人维护"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Community interest driven"
|
||||||
|
msgstr "社区兴趣驱动"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "All bugfixes are appropriate. No Releases produced, No CI commitment"
|
||||||
|
msgstr "所有的 bug 修复都是合适的。没有发布版本,不承诺持续集成(CI)。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "End of Life (EOL)"
|
||||||
|
msgstr "生命周期结束(EOL)"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "N/A"
|
||||||
|
msgstr "不适用"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Branch no longer accepting changes"
|
||||||
|
msgstr "该分支不再接受更改"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:71
|
||||||
|
msgid "Branch state"
|
||||||
|
msgstr "分支状态"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:73
|
||||||
|
msgid ""
|
||||||
|
"Note that vLLM Ascend will only be released for a certain vLLM release "
|
||||||
|
"version rather than all versions. Hence, You might see only part of versions"
|
||||||
|
" have dev branches (such as only `0.7.1-dev` / `0.7.3-dev` but no "
|
||||||
|
"`0.7.2-dev`), this is as expected."
|
||||||
|
msgstr ""
|
||||||
|
"请注意,vLLM Ascend 只会针对某些 vLLM 发布版本发布,而不是所有版本。因此,您可能会看到只有部分版本拥有开发分支(例如只有 "
|
||||||
|
"`0.7.1-dev` / `0.7.3-dev`,而没有 `0.7.2-dev`),这是正常现象。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:75
|
||||||
|
msgid ""
|
||||||
|
"Usually, each minor version of vLLM (such as 0.7) will correspond to a vLLM "
|
||||||
|
"Ascend version branch and support its latest version (for example, we plan "
|
||||||
|
"to support version 0.7.3) as following shown:"
|
||||||
|
msgstr ""
|
||||||
|
"通常,vLLM 的每一个小版本(例如 0.7)都会对应一个 vLLM Ascend 版本分支,并支持其最新版本(例如,我们计划支持 0.7.3 "
|
||||||
|
"版),如下所示:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Status"
|
||||||
|
msgstr "状态"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Note"
|
||||||
|
msgstr "注释"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "main"
|
||||||
|
msgstr "main"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "CI commitment for vLLM main branch and vLLM 0.9.2 branch"
|
||||||
|
msgstr "vLLM 主分支和 vLLM 0.9.2 分支的 CI 承诺"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.9.1-dev"
|
||||||
|
msgstr "v0.9.1-dev"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "CI commitment for vLLM 0.9.1 version"
|
||||||
|
msgstr "vLLM 0.9.1 版本的 CI 承诺"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.3-dev"
|
||||||
|
msgstr "v0.7.3-dev"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "CI commitment for vLLM 0.7.3 version"
|
||||||
|
msgstr "vLLM 0.7.3 版本的 CI 承诺"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "v0.7.1-dev"
|
||||||
|
msgstr "v0.7.1-dev"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Replaced by v0.7.3-dev"
|
||||||
|
msgstr "已被 v0.7.3-dev 替代"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:84
|
||||||
|
msgid "Backward compatibility"
|
||||||
|
msgstr "向后兼容性"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:86
|
||||||
|
msgid ""
|
||||||
|
"For main branch, vLLM Ascend should works with vLLM main branch and latest 1"
|
||||||
|
" or 2 release version. So to ensure the backward compatibility, we will do "
|
||||||
|
"the following:"
|
||||||
|
msgstr ""
|
||||||
|
"对于主分支,vLLM Ascend 应该与 vLLM 主分支以及最新的 1 或 2 个发布版本兼容。因此,为了确保向后兼容性,我们将执行以下操作:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:87
|
||||||
|
msgid ""
|
||||||
|
"Both main branch and target vLLM release is tested by Ascend E2E CI. For "
|
||||||
|
"example, currently, vLLM main branch and vLLM 0.8.4 are tested now."
|
||||||
|
msgstr "主分支和目标 vLLM 发行版都经过了 Ascend E2E CI 的测试。例如,目前正在测试 vLLM 主分支和 vLLM 0.8.4。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:88
|
||||||
|
msgid ""
|
||||||
|
"For code changes, we will make sure that the changes are compatible with the"
|
||||||
|
" latest 1 or 2 vLLM release version as well. In this case, vLLM Ascend "
|
||||||
|
"introduced a version check machinism inner the code. It'll check the version"
|
||||||
|
" of installed vLLM package first to decide which code logic to use. If users"
|
||||||
|
" hit the `InvalidVersion` error, it sometimes means that they have installed"
|
||||||
|
" an dev/editable version of vLLM package. In this case, we provide the env "
|
||||||
|
"variable `VLLM_VERSION` to let users specify the version of vLLM package to "
|
||||||
|
"use."
|
||||||
|
msgstr ""
|
||||||
|
"对于代码更改,我们也会确保这些更改与最新的 1 或 2 个 vLLM 发行版本兼容。在这种情况下,vLLM Ascend "
|
||||||
|
"在代码中引入了版本检查机制。它会先检查已安装的 vLLM 包的版本,然后决定使用哪段代码逻辑。如果用户遇到 `InvalidVersion` "
|
||||||
|
"错误,这有时意味着他们安装了 dev/可编辑版本的 vLLM 包。此时,我们提供了环境变量 `VLLM_VERSION`,让用户可以指定要使用的 "
|
||||||
|
"vLLM 包版本。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:89
|
||||||
|
msgid ""
|
||||||
|
"For documentation changes, we will make sure that the changes are compatible"
|
||||||
|
" with the latest 1 or 2 vLLM release version as well. Note should be added "
|
||||||
|
"if there are any breaking changes."
|
||||||
|
msgstr "对于文档更改,我们会确保这些更改也兼容于最新的1个或2个 vLLM 发布版本。如果有任何重大变更,应添加说明。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:91
|
||||||
|
msgid "Document Branch Policy"
|
||||||
|
msgstr "文档分支政策"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:92
|
||||||
|
msgid ""
|
||||||
|
"To reduce maintenance costs, **all branch documentation content should "
|
||||||
|
"remain consistent, and version differences can be controlled via variables "
|
||||||
|
"in [docs/source/conf.py](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/docs/source/conf.py)**. While this is not a simple task, it"
|
||||||
|
" is a principle we should strive to follow."
|
||||||
|
msgstr ""
|
||||||
|
"为了减少维护成本,**所有分支的文档内容应保持一致,版本差异可以通过 "
|
||||||
|
"[docs/source/conf.py](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/docs/source/conf.py) 中的变量进行控制**。虽然这并非易事,但这是我们应当努力遵循的原则。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Version"
|
||||||
|
msgstr "版本"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Purpose"
|
||||||
|
msgstr "用途"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Code Branch"
|
||||||
|
msgstr "代码分支"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "latest"
|
||||||
|
msgstr "最新"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Doc for the latest dev branch"
|
||||||
|
msgstr "最新开发分支的文档"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "vX.Y.Z-dev (Will be `main` after the first final release)"
|
||||||
|
msgstr "vX.Y.Z-dev(在第一个正式版本发布后将成为 `main`)"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "version"
|
||||||
|
msgstr "版本"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Doc for historical released versions"
|
||||||
|
msgstr "历史版本文档"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Git tags, like vX.Y.Z[rcN]"
|
||||||
|
msgstr "Git 标签,如 vX.Y.Z[rcN]"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "stable(not yet released)"
|
||||||
|
msgstr "稳定版(尚未发布)"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Doc for latest final release branch"
|
||||||
|
msgstr "最新正式发布分支的文档"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md
|
||||||
|
msgid "Will be `vX.Y.Z-dev` after the first official release"
|
||||||
|
msgstr "首个正式发布后将会是 `vX.Y.Z-dev`"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:100
|
||||||
|
msgid "As shown above:"
|
||||||
|
msgstr "如上所示:"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:102
|
||||||
|
msgid ""
|
||||||
|
"`latest` documentation: Matches the current maintenance branch `vX.Y.Z-dev` "
|
||||||
|
"(Will be `main` after the first final release). Continuously updated to "
|
||||||
|
"ensure usability for the latest release."
|
||||||
|
msgstr ""
|
||||||
|
"`latest` 文档:匹配当前维护分支 `vX.Y.Z-dev`(在首次正式发布后将为 `main`)。持续更新,以确保适用于最新发布版本。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:103
|
||||||
|
msgid ""
|
||||||
|
"`version` documentation: Corresponds to specific released versions (e.g., "
|
||||||
|
"`v0.7.3`, `v0.7.3rc1`). No further updates after release."
|
||||||
|
msgstr "`version` 文档:对应特定的已发布版本(例如,`v0.7.3`、`v0.7.3rc1`)。发布后不再进行更新。"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:104
|
||||||
|
msgid ""
|
||||||
|
"`stable` documentation (**not yet released**): Official release "
|
||||||
|
"documentation. Updates are allowed in real-time after release, typically "
|
||||||
|
"based on vX.Y.Z-dev. Once stable documentation is available, non-stable "
|
||||||
|
"versions should display a header warning: `You are viewing the latest "
|
||||||
|
"developer preview docs. Click here to view docs for the latest stable "
|
||||||
|
"release.`."
|
||||||
|
msgstr ""
|
||||||
|
"`stable` 文档(**尚未发布**):官方发布版文档。发布后允许实时更新,通常基于 "
|
||||||
|
"vX.Y.Z-dev。一旦稳定版文档可用,非稳定版本应显示一个顶部警告:`您正在查看最新的开发预览文档。点击此处查看最新稳定版本文档。`"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:106
|
||||||
|
msgid "Software Dependency Management"
|
||||||
|
msgstr "软件依赖管理"
|
||||||
|
|
||||||
|
#: ../../community/versioning_policy.md:107
|
||||||
|
msgid ""
|
||||||
|
"`torch-npu`: Ascend Extension for PyTorch (torch-npu) releases a stable "
|
||||||
|
"version to [PyPi](https://pypi.org/project/torch-npu) every 3 months, a "
|
||||||
|
"development version (aka the POC version) every month, and a nightly version"
|
||||||
|
" every day. The PyPi stable version **CAN** be used in vLLM Ascend final "
|
||||||
|
"version, the monthly dev version **ONLY CANN** be used in vLLM Ascend RC "
|
||||||
|
"version for rapid iteration, the nightly version **CANNOT** be used in vLLM "
|
||||||
|
"Ascend any version and branches."
|
||||||
|
msgstr ""
|
||||||
|
"`torch-npu`:Ascend Extension for PyTorch(torch-npu)每 3 个月会在 "
|
||||||
|
"[PyPi](https://pypi.org/project/torch-npu) 上发布一个稳定版本,每个月发布一个开发版本(即 POC "
|
||||||
|
"版本),每天发布一个 nightly 版本。PyPi 上的稳定版本**可以**用于 vLLM Ascend 的正式版本,月度开发版本**只能**用于 "
|
||||||
|
"vLLM Ascend 的 RC(候选发布)版本以便快速迭代,nightly 版本**不能**用于 vLLM Ascend 的任何版本和分支。"
|
||||||
@@ -0,0 +1,187 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:107
|
||||||
|
msgid "Index"
|
||||||
|
msgstr "索引"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:1
|
||||||
|
msgid "Contributing"
|
||||||
|
msgstr "贡献"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:3
|
||||||
|
msgid "Building and testing"
|
||||||
|
msgstr "构建与测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:4
|
||||||
|
msgid ""
|
||||||
|
"It's recommended to set up a local development environment to build and test"
|
||||||
|
" before you submit a PR."
|
||||||
|
msgstr "建议先搭建本地开发环境来进行构建和测试,再提交 PR。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:7
|
||||||
|
msgid "Setup development environment"
|
||||||
|
msgstr "搭建开发环境"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:9
|
||||||
|
msgid ""
|
||||||
|
"Theoretically, the vllm-ascend build is only supported on Linux because "
|
||||||
|
"`vllm-ascend` dependency `torch_npu` only supports Linux."
|
||||||
|
msgstr ""
|
||||||
|
"理论上,vllm-ascend 构建仅支持 Linux,因为 `vllm-ascend` 的依赖项 `torch_npu` 只支持 Linux。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:12
|
||||||
|
msgid ""
|
||||||
|
"But you can still set up dev env on Linux/Windows/macOS for linting and "
|
||||||
|
"basic test as following commands:"
|
||||||
|
msgstr "但你仍然可以在 Linux/Windows/macOS 上按照以下命令设置开发环境,用于代码规约检查和基本测试:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:15
|
||||||
|
msgid "Run lint locally"
|
||||||
|
msgstr "在本地运行 lint"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:33
|
||||||
|
msgid "Run CI locally"
|
||||||
|
msgstr "本地运行CI"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:35
|
||||||
|
msgid "After complete \"Run lint\" setup, you can run CI locally:"
|
||||||
|
msgstr "在完成“运行 lint”设置后,你可以在本地运行 CI:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:61
|
||||||
|
msgid "Submit the commit"
|
||||||
|
msgstr "提交该提交"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:68
|
||||||
|
msgid ""
|
||||||
|
"🎉 Congratulations! You have completed the development environment setup."
|
||||||
|
msgstr "🎉 恭喜!你已经完成了开发环境的搭建。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:70
|
||||||
|
msgid "Test locally"
|
||||||
|
msgstr "本地测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:72
|
||||||
|
msgid ""
|
||||||
|
"You can refer to [Testing](./testing.md) doc to help you setup testing "
|
||||||
|
"environment and running tests locally."
|
||||||
|
msgstr "你可以参考 [测试](./testing.md) 文档,帮助你搭建测试环境并在本地运行测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:74
|
||||||
|
msgid "DCO and Signed-off-by"
|
||||||
|
msgstr "DCO 和签名确认"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:76
|
||||||
|
msgid ""
|
||||||
|
"When contributing changes to this project, you must agree to the DCO. "
|
||||||
|
"Commits must include a `Signed-off-by:` header which certifies agreement "
|
||||||
|
"with the terms of the DCO."
|
||||||
|
msgstr "当为本项目贡献更改时,您必须同意 DCO。提交必须包含 `Signed-off-by:` 头部,以证明您同意 DCO 的条款。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:78
|
||||||
|
msgid "Using `-s` with `git commit` will automatically add this header."
|
||||||
|
msgstr "在使用 `git commit` 时加上 `-s` 参数会自动添加这个头部信息。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:80
|
||||||
|
msgid "PR Title and Classification"
|
||||||
|
msgstr "PR 标题与分类"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:82
|
||||||
|
msgid ""
|
||||||
|
"Only specific types of PRs will be reviewed. The PR title is prefixed "
|
||||||
|
"appropriately to indicate the type of change. Please use one of the "
|
||||||
|
"following:"
|
||||||
|
msgstr "只有特定类型的 PR 会被审核。PR 标题应使用合适的前缀以指明更改类型。请使用以下之一:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:84
|
||||||
|
msgid "`[Attention]` for new features or optimization in attention."
|
||||||
|
msgstr "`[Attention]` 用于注意力机制中新特性或优化。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:85
|
||||||
|
msgid "`[Communicator]` for new features or optimization in communicators."
|
||||||
|
msgstr "`[Communicator]` 适用于通信器中的新特性或优化。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:86
|
||||||
|
msgid "`[ModelRunner]` for new features or optimization in model runner."
|
||||||
|
msgstr "`[ModelRunner]` 用于模型运行器中的新功能或优化。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:87
|
||||||
|
msgid "`[Platform]` for new features or optimization in platform."
|
||||||
|
msgstr "`[Platform]` 用于平台中新功能或优化。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:88
|
||||||
|
msgid "`[Worker]` for new features or optimization in worker."
|
||||||
|
msgstr "`[Worker]` 用于 worker 的新功能或优化。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:89
|
||||||
|
msgid ""
|
||||||
|
"`[Core]` for new features or optimization in the core vllm-ascend logic "
|
||||||
|
"(such as platform, attention, communicators, model runner)"
|
||||||
|
msgstr "`[Core]` 用于核心 vllm-ascend 逻辑中的新特性或优化(例如平台、注意力机制、通信器、模型运行器)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:90
|
||||||
|
msgid "`[Kernel]` changes affecting compute kernels and ops."
|
||||||
|
msgstr "`[Kernel]` 影响计算内核和操作的更改。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:91
|
||||||
|
msgid "`[Bugfix]` for bug fixes."
|
||||||
|
msgstr "`[Bugfix]` 用于表示错误修复。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:92
|
||||||
|
msgid "`[Doc]` for documentation fixes and improvements."
|
||||||
|
msgstr "`[Doc]` 用于文档修复和改进。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:93
|
||||||
|
msgid "`[Test]` for tests (such as unit tests)."
|
||||||
|
msgstr "`[Test]` 用于测试(如单元测试)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:94
|
||||||
|
msgid "`[CI]` for build or continuous integration improvements."
|
||||||
|
msgstr "`[CI]` 用于构建或持续集成的改进。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:95
|
||||||
|
msgid ""
|
||||||
|
"`[Misc]` for PRs that do not fit the above categories. Please use this "
|
||||||
|
"sparingly."
|
||||||
|
msgstr "对于不属于上述类别的 PR,请使用 `[Misc]`。请谨慎使用此标签。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:98
|
||||||
|
msgid ""
|
||||||
|
"If the PR spans more than one category, please include all relevant "
|
||||||
|
"prefixes."
|
||||||
|
msgstr "如果拉取请求(PR)涵盖多个类别,请包含所有相关的前缀。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:101
|
||||||
|
msgid "Others"
|
||||||
|
msgstr "其他"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/index.md:103
|
||||||
|
msgid ""
|
||||||
|
"You may find more information about contributing to vLLM Ascend backend "
|
||||||
|
"plugin on "
|
||||||
|
"[<u>docs.vllm.ai</u>](https://docs.vllm.ai/en/latest/contributing/overview.html)."
|
||||||
|
" If you find any problem when contributing, you can feel free to submit a PR"
|
||||||
|
" to improve the doc to help other developers."
|
||||||
|
msgstr ""
|
||||||
|
"你可以在 "
|
||||||
|
"[<u>docs.vllm.ai</u>](https://docs.vllm.ai/en/latest/contributing/overview.html)"
|
||||||
|
" 上找到有关为 vLLM Ascend 后端插件做贡献的更多信息。如果你在贡献过程中遇到任何问题,欢迎随时提交 PR 来改进文档,以帮助其他开发者。"
|
||||||
@@ -0,0 +1,237 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:1
|
||||||
|
msgid "Testing"
|
||||||
|
msgstr "测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:3
|
||||||
|
msgid ""
|
||||||
|
"This secition explains how to write e2e tests and unit tests to verify the "
|
||||||
|
"implementation of your feature."
|
||||||
|
msgstr "本节介绍如何编写端到端测试和单元测试,以验证你的功能实现。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:5
|
||||||
|
msgid "Setup test environment"
|
||||||
|
msgstr "设置测试环境"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:7
|
||||||
|
msgid ""
|
||||||
|
"The fastest way to setup test environment is to use the main branch "
|
||||||
|
"container image:"
|
||||||
|
msgstr "搭建测试环境最快的方法是使用 main 分支的容器镜像:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md
|
||||||
|
msgid "Local (CPU)"
|
||||||
|
msgstr "本地(CPU)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:18
|
||||||
|
msgid "You can run the unit tests on CPU with the following steps:"
|
||||||
|
msgstr "你可以按照以下步骤在 CPU 上运行单元测试:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md
|
||||||
|
msgid "Single card"
|
||||||
|
msgstr "单张卡片"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:85
|
||||||
|
#: ../../developer_guide/contribution/testing.md:123
|
||||||
|
msgid ""
|
||||||
|
"After starting the container, you should install the required packages:"
|
||||||
|
msgstr "启动容器后,你应该安装所需的软件包:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md
|
||||||
|
msgid "Multi cards"
|
||||||
|
msgstr "多卡"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:137
|
||||||
|
msgid "Running tests"
|
||||||
|
msgstr "运行测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:139
|
||||||
|
msgid "Unit test"
|
||||||
|
msgstr "单元测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:141
|
||||||
|
msgid "There are several principles to follow when writing unit tests:"
|
||||||
|
msgstr "编写单元测试时需要遵循几个原则:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:143
|
||||||
|
msgid ""
|
||||||
|
"The test file path should be consistent with source file and start with "
|
||||||
|
"`test_` prefix, such as: `vllm_ascend/worker/worker_v1.py` --> "
|
||||||
|
"`tests/ut/worker/test_worker_v1.py`"
|
||||||
|
msgstr ""
|
||||||
|
"测试文件的路径应与源文件保持一致,并以 `test_` 前缀开头,例如:`vllm_ascend/worker/worker_v1.py` --> "
|
||||||
|
"`tests/ut/worker/test_worker_v1.py`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:144
|
||||||
|
msgid ""
|
||||||
|
"The vLLM Ascend test are using unittest framework, see "
|
||||||
|
"[here](https://docs.python.org/3/library/unittest.html#module-unittest) to "
|
||||||
|
"understand how to write unit tests."
|
||||||
|
msgstr ""
|
||||||
|
"vLLM Ascend 测试使用 unittest "
|
||||||
|
"框架,参见[这里](https://docs.python.org/3/library/unittest.html#module-"
|
||||||
|
"unittest)了解如何编写单元测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:145
|
||||||
|
msgid ""
|
||||||
|
"All unit tests can be run on CPU, so you must mock the device-related "
|
||||||
|
"function to host."
|
||||||
|
msgstr "所有单元测试都可以在 CPU 上运行,因此你必须将与设备相关的函数模拟为 host。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:146
|
||||||
|
msgid ""
|
||||||
|
"Example: [tests/ut/test_ascend_config.py](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/tests/ut/test_ascend_config.py)."
|
||||||
|
msgstr ""
|
||||||
|
"示例:[tests/ut/test_ascend_config.py](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/tests/ut/test_ascend_config.py)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:147
|
||||||
|
msgid "You can run the unit tests using `pytest`:"
|
||||||
|
msgstr "你可以使用 `pytest` 运行单元测试:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md
|
||||||
|
msgid "Multi cards test"
|
||||||
|
msgstr "多卡测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:192
|
||||||
|
msgid "E2E test"
|
||||||
|
msgstr "端到端测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:194
|
||||||
|
msgid ""
|
||||||
|
"Although vllm-ascend CI provide [e2e test](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_test.yaml) on "
|
||||||
|
"Ascend CI, you can run it locally."
|
||||||
|
msgstr ""
|
||||||
|
"虽然 vllm-ascend CI 在 Ascend CI 上提供了 [端到端测试](https://github.com/vllm-"
|
||||||
|
"project/vllm-"
|
||||||
|
"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml),你也可以在本地运行它。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:204
|
||||||
|
msgid "You can't run e2e test on CPU."
|
||||||
|
msgstr "你无法在 CPU 上运行 e2e 测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:240
|
||||||
|
msgid ""
|
||||||
|
"This will reproduce e2e test: "
|
||||||
|
"[vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml)."
|
||||||
|
msgstr ""
|
||||||
|
"这将复现端到端测试:[vllm_ascend_test.yaml](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/.github/workflows/vllm_ascend_test.yaml)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:242
|
||||||
|
msgid "E2E test example:"
|
||||||
|
msgstr "E2E 测试示例:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:244
|
||||||
|
msgid ""
|
||||||
|
"Offline test example: "
|
||||||
|
"[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-"
|
||||||
|
"ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)"
|
||||||
|
msgstr ""
|
||||||
|
"离线测试示例:[`tests/e2e/singlecard/test_offline_inference.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-"
|
||||||
|
"ascend/blob/main/tests/e2e/singlecard/test_offline_inference.py)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:245
|
||||||
|
msgid ""
|
||||||
|
"Online test examples: "
|
||||||
|
"[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)"
|
||||||
|
msgstr ""
|
||||||
|
"在线测试示例:[`tests/e2e/singlecard/test_prompt_embedding.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_prompt_embedding.py)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:246
|
||||||
|
msgid ""
|
||||||
|
"Correctness test example: "
|
||||||
|
"[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
|
||||||
|
msgstr ""
|
||||||
|
"正确性测试示例:[`tests/e2e/singlecard/test_aclgraph.py`](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/tests/e2e/singlecard/test_aclgraph.py)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:247
|
||||||
|
msgid ""
|
||||||
|
"Reduced Layer model test example: [test_torchair_graph_mode.py - "
|
||||||
|
"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)"
|
||||||
|
msgstr ""
|
||||||
|
"简化层模型测试示例:[test_torchair_graph_mode.py - "
|
||||||
|
"DeepSeek-V3-Pruning](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/20767a043cccb3764214930d4695e53941de87ec/tests/e2e/multicard/test_torchair_graph_mode.py#L48)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:249
|
||||||
|
msgid ""
|
||||||
|
"The CI resource is limited, you might need to reduce layer number of the "
|
||||||
|
"model, below is an example of how to generate a reduced layer model:"
|
||||||
|
msgstr "CI 资源有限,您可能需要减少模型的层数,下面是一个生成减少层数模型的示例:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:250
|
||||||
|
msgid ""
|
||||||
|
"Fork the original model repo in modelscope, we need all the files in the "
|
||||||
|
"repo except for weights."
|
||||||
|
msgstr "在 modelscope 中 fork 原始模型仓库,我们需要仓库中的所有文件,除了权重文件。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:251
|
||||||
|
#, python-brace-format
|
||||||
|
msgid ""
|
||||||
|
"Set `num_hidden_layers` to the expected number of layers, e.g., "
|
||||||
|
"`{\"num_hidden_layers\": 2,}`"
|
||||||
|
msgstr "将 `num_hidden_layers` 设置为期望的层数,例如 `{\"num_hidden_layers\": 2,}`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:252
|
||||||
|
msgid ""
|
||||||
|
"Copy the following python script as `generate_random_weight.py`. Set the "
|
||||||
|
"relevant parameters `MODEL_LOCAL_PATH`, `DIST_DTYPE` and `DIST_MODEL_PATH` "
|
||||||
|
"as needed:"
|
||||||
|
msgstr ""
|
||||||
|
"将以下 Python 脚本复制为 `generate_random_weight.py`。根据需要设置相关参数 "
|
||||||
|
"`MODEL_LOCAL_PATH`、`DIST_DTYPE` 和 `DIST_MODEL_PATH`:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:270
|
||||||
|
msgid "Run doctest"
|
||||||
|
msgstr "运行 doctest"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:272
|
||||||
|
msgid ""
|
||||||
|
"vllm-ascend provides a `vllm-ascend/tests/e2e/run_doctests.sh` command to "
|
||||||
|
"run all doctests in the doc files. The doctest is a good way to make sure "
|
||||||
|
"the docs are up to date and the examples are executable, you can run it "
|
||||||
|
"locally as follows:"
|
||||||
|
msgstr ""
|
||||||
|
"vllm-ascend 提供了一个 `vllm-ascend/tests/e2e/run_doctests.sh` 命令,用于运行文档文件中的所有 "
|
||||||
|
"doctest。doctest 是确保文档保持最新且示例可执行的好方法,你可以按照以下方式在本地运行它:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/contribution/testing.md:280
|
||||||
|
msgid ""
|
||||||
|
"This will reproduce the same environment as the CI: "
|
||||||
|
"[vllm_ascend_doctest.yaml](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml)."
|
||||||
|
msgstr ""
|
||||||
|
"这将复现与 CI 相同的环境:[vllm_ascend_doctest.yaml](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/blob/main/.github/workflows/vllm_ascend_doctest.yaml)。"
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/accuracy_report/index.md:1
|
||||||
|
#: ../../developer_guide/evaluation/accuracy_report/index.md:3
|
||||||
|
msgid "Accuracy Report"
|
||||||
|
msgstr "准确性报告"
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/index.md:1
|
||||||
|
#: ../../developer_guide/evaluation/index.md:3
|
||||||
|
msgid "Accuracy"
|
||||||
|
msgstr "准确性"
|
||||||
@@ -0,0 +1,112 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:1
|
||||||
|
msgid "Using EvalScope"
|
||||||
|
msgstr "使用 EvalScope"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:3
|
||||||
|
msgid ""
|
||||||
|
"This document will guide you have model inference stress testing and "
|
||||||
|
"accuracy testing using [EvalScope](https://github.com/modelscope/evalscope)."
|
||||||
|
msgstr ""
|
||||||
|
"本文档将指导您如何使用 [EvalScope](https://github.com/modelscope/evalscope) "
|
||||||
|
"进行模型推理压力测试和精度测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:5
|
||||||
|
msgid "1. Online serving"
|
||||||
|
msgstr "1. 在线服务"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:7
|
||||||
|
msgid "You can run docker container to start the vLLM server on a single NPU:"
|
||||||
|
msgstr "你可以运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:34
|
||||||
|
msgid "If your service start successfully, you can see the info shown below:"
|
||||||
|
msgstr "如果你的服务启动成功,你会看到如下所示的信息:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:42
|
||||||
|
msgid ""
|
||||||
|
"Once your server is started, you can query the model with input prompts in "
|
||||||
|
"new terminal:"
|
||||||
|
msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:55
|
||||||
|
msgid "2. Install EvalScope using pip"
|
||||||
|
msgstr "2. 使用 pip 安装 EvalScope"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:57
|
||||||
|
msgid "You can install EvalScope by using:"
|
||||||
|
msgstr "你可以使用以下方式安装 EvalScope:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:65
|
||||||
|
msgid "3. Run gsm8k accuracy test using EvalScope"
|
||||||
|
msgstr "3. 使用 EvalScope 运行 gsm8k 准确率测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:67
|
||||||
|
msgid "You can `evalscope eval` run gsm8k accuracy test:"
|
||||||
|
msgstr "你可以使用 `evalscope eval` 运行 gsm8k 准确率测试:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:78
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:114
|
||||||
|
msgid "After 1-2 mins, the output is as shown below:"
|
||||||
|
msgstr "1-2 分钟后,输出如下所示:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:88
|
||||||
|
msgid ""
|
||||||
|
"See more detail in: [EvalScope doc - Model API Service "
|
||||||
|
"Evaluation](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-"
|
||||||
|
"api-service-evaluation)."
|
||||||
|
msgstr ""
|
||||||
|
"更多详情请见:[EvalScope 文档 - 模型 API "
|
||||||
|
"服务评测](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-"
|
||||||
|
"api-service-evaluation)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:90
|
||||||
|
msgid "4. Run model inference stress testing using EvalScope"
|
||||||
|
msgstr "4. 使用 EvalScope 运行模型推理压力测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:92
|
||||||
|
msgid "Install EvalScope[perf] using pip"
|
||||||
|
msgstr "使用 pip 安装 EvalScope[perf]"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:98
|
||||||
|
msgid "Basic usage"
|
||||||
|
msgstr "基本用法"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:100
|
||||||
|
msgid "You can use `evalscope perf` run perf test:"
|
||||||
|
msgstr "你可以使用 `evalscope perf` 运行性能测试:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:112
|
||||||
|
msgid "Output results"
|
||||||
|
msgstr "输出结果"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_evalscope.md:173
|
||||||
|
msgid ""
|
||||||
|
"See more detail in: [EvalScope doc - Model Inference Stress "
|
||||||
|
"Testing](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-"
|
||||||
|
"usage)."
|
||||||
|
msgstr ""
|
||||||
|
"更多详情见:[EvalScope 文档 - "
|
||||||
|
"模型推理压力测试](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-"
|
||||||
|
"usage)。"
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:1
|
||||||
|
msgid "Using lm-eval"
|
||||||
|
msgstr "使用 lm-eval"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:2
|
||||||
|
msgid ""
|
||||||
|
"This document will guide you have a accuracy testing using [lm-"
|
||||||
|
"eval](https://github.com/EleutherAI/lm-evaluation-harness)."
|
||||||
|
msgstr ""
|
||||||
|
"本文将指导你如何使用 [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) "
|
||||||
|
"进行准确率测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:4
|
||||||
|
msgid "1. Run docker container"
|
||||||
|
msgstr "1. 运行 docker 容器"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:6
|
||||||
|
msgid "You can run docker container on a single NPU:"
|
||||||
|
msgstr "你可以在单个NPU上运行docker容器:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:33
|
||||||
|
msgid "2. Run ceval accuracy test using lm-eval"
|
||||||
|
msgstr "2. 使用 lm-eval 运行 ceval 准确性测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:34
|
||||||
|
msgid "Install lm-eval in the container."
|
||||||
|
msgstr "在容器中安装 lm-eval。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:39
|
||||||
|
msgid "Run the following command:"
|
||||||
|
msgstr "运行以下命令:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:50
|
||||||
|
msgid "After 1-2 mins, the output is as shown below:"
|
||||||
|
msgstr "1-2 分钟后,输出如下所示:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_lm_eval.md:62
|
||||||
|
msgid ""
|
||||||
|
"You can see more usage on [Lm-eval Docs](https://github.com/EleutherAI/lm-"
|
||||||
|
"evaluation-harness/blob/main/docs/README.md)."
|
||||||
|
msgstr ""
|
||||||
|
"你可以在 [Lm-eval 文档](https://github.com/EleutherAI/lm-evaluation-"
|
||||||
|
"harness/blob/main/docs/README.md) 上查看更多用法。"
|
||||||
@@ -0,0 +1,83 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:1
|
||||||
|
msgid "Using OpenCompass"
|
||||||
|
msgstr "使用 OpenCompass"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:2
|
||||||
|
msgid ""
|
||||||
|
"This document will guide you have a accuracy testing using "
|
||||||
|
"[OpenCompass](https://github.com/open-compass/opencompass)."
|
||||||
|
msgstr ""
|
||||||
|
"本文档将指导你如何使用 [OpenCompass](https://github.com/open-compass/opencompass) "
|
||||||
|
"进行准确率测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:4
|
||||||
|
msgid "1. Online Serving"
|
||||||
|
msgstr "1. 在线服务"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:6
|
||||||
|
msgid "You can run docker container to start the vLLM server on a single NPU:"
|
||||||
|
msgstr "你可以运行 docker 容器,在单个 NPU 上启动 vLLM 服务器:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:32
|
||||||
|
msgid "If your service start successfully, you can see the info shown below:"
|
||||||
|
msgstr "如果你的服务启动成功,你会看到如下所示的信息:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:39
|
||||||
|
msgid ""
|
||||||
|
"Once your server is started, you can query the model with input prompts in "
|
||||||
|
"new terminal:"
|
||||||
|
msgstr "一旦你的服务器启动后,你可以在新的终端中用输入提示词查询模型:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:51
|
||||||
|
msgid "2. Run ceval accuracy test using OpenCompass"
|
||||||
|
msgstr "2. 使用 OpenCompass 运行 ceval 准确率测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:52
|
||||||
|
msgid ""
|
||||||
|
"Install OpenCompass and configure the environment variables in the "
|
||||||
|
"container."
|
||||||
|
msgstr "在容器中安装 OpenCompass 并配置环境变量。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:64
|
||||||
|
msgid ""
|
||||||
|
"Add `opencompass/configs/eval_vllm_ascend_demo.py` with the following "
|
||||||
|
"content:"
|
||||||
|
msgstr "添加 `opencompass/configs/eval_vllm_ascend_demo.py`,内容如下:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:104
|
||||||
|
msgid "Run the following command:"
|
||||||
|
msgstr "运行以下命令:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:110
|
||||||
|
msgid "After 1-2 mins, the output is as shown below:"
|
||||||
|
msgstr "1-2 分钟后,输出如下所示:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/evaluation/using_opencompass.md:120
|
||||||
|
msgid ""
|
||||||
|
"You can see more usage on [OpenCompass "
|
||||||
|
"Docs](https://opencompass.readthedocs.io/en/latest/index.html)."
|
||||||
|
msgstr ""
|
||||||
|
"你可以在 [OpenCompass "
|
||||||
|
"文档](https://opencompass.readthedocs.io/en/latest/index.html) 查看更多用法。"
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/index.md:1
|
||||||
|
#: ../../developer_guide/feature_guide/index.md:5
|
||||||
|
msgid "Feature Guide"
|
||||||
|
msgstr "功能指南"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/index.md:3
|
||||||
|
msgid ""
|
||||||
|
"This section provides an overview of the features implemented in vLLM "
|
||||||
|
"Ascend. Developers can refer to this guide to understand how vLLM Ascend "
|
||||||
|
"works."
|
||||||
|
msgstr "本节概述了 vLLM Ascend 中实现的功能。开发者可以参考本指南以了解 vLLM Ascend 的工作原理。"
|
||||||
@@ -0,0 +1,248 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:1
|
||||||
|
msgid "Patch in vLLM Ascend"
|
||||||
|
msgstr "在 vLLM Ascend 中的补丁"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:3
|
||||||
|
msgid ""
|
||||||
|
"vLLM Ascend is a platform plugin for vLLM. Due to the release cycle of vLLM "
|
||||||
|
"and vLLM Ascend is different, and the hardware limitation in some case, we "
|
||||||
|
"need to patch some code in vLLM to make it compatible with vLLM Ascend."
|
||||||
|
msgstr ""
|
||||||
|
"vLLM Ascend 是 vLLM 的一个平台插件。由于 vLLM 和 vLLM Ascend "
|
||||||
|
"的发布周期不同,并且在某些情况下存在硬件限制,我们需要对 vLLM 进行一些代码补丁,以使其能够兼容 vLLM Ascend。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:5
|
||||||
|
msgid ""
|
||||||
|
"In vLLM Ascend code, we provide a patch module `vllm_ascend/patch` to "
|
||||||
|
"address the change for vLLM."
|
||||||
|
msgstr "在 vLLM Ascend 代码中,我们提供了一个补丁模块 `vllm_ascend/patch` 用于应对 vLLM 的变更。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:7
|
||||||
|
msgid "Principle"
|
||||||
|
msgstr "原理"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:9
|
||||||
|
msgid ""
|
||||||
|
"We should keep in mind that Patch is not the best way to make vLLM Ascend "
|
||||||
|
"compatible. It's just a temporary solution. The best way is to contribute "
|
||||||
|
"the change to vLLM to make it compatible with vLLM Ascend originally. In "
|
||||||
|
"vLLM Ascend, we have the basic principle for Patch strategy:"
|
||||||
|
msgstr ""
|
||||||
|
"我们需要记住,Patch 不是让 vLLM 兼容 Ascend 的最佳方式,这只是一个临时的解决方案。最好的方法是将修改贡献到 vLLM 项目中,从而让"
|
||||||
|
" vLLM 原生支持 Ascend。对于 vLLM Ascend,我们对 Patch 策略有一个基本原则:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:11
|
||||||
|
msgid "Less is more. Please do not patch unless it's the only way currently."
|
||||||
|
msgstr "少即是多。请不要打补丁,除非这是目前唯一的方法。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:12
|
||||||
|
msgid ""
|
||||||
|
"Once a patch is added, it's required to describe the future plan for "
|
||||||
|
"removing the patch."
|
||||||
|
msgstr "一旦补丁被添加,必须说明将来移除该补丁的计划。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:13
|
||||||
|
msgid "Anytime, clean the patch code is welcome."
|
||||||
|
msgstr "任何时候,欢迎清理补丁代码。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:15
|
||||||
|
msgid "How it works"
|
||||||
|
msgstr "工作原理"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:17
|
||||||
|
msgid "In `vllm_ascend/patch`, you can see the code structure as follows:"
|
||||||
|
msgstr "在 `vllm_ascend/patch` 目录中,你可以看到如下代码结构:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:33
|
||||||
|
msgid ""
|
||||||
|
"**platform**: The patch code in this directory is for patching the code in "
|
||||||
|
"vLLM main process. It's called by "
|
||||||
|
"`vllm_ascend/platform::NPUPlatform::pre_register_and_update` very early when"
|
||||||
|
" vLLM is initialized."
|
||||||
|
msgstr ""
|
||||||
|
"**platform**:此目录下的补丁代码用于修补 vLLM 主进程中的代码。当 vLLM 初始化时,会在很早的阶段由 "
|
||||||
|
"`vllm_ascend/platform::NPUPlatform::pre_register_and_update` 调用。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:34
|
||||||
|
msgid ""
|
||||||
|
"For online mode, vLLM process calls the platform patch here "
|
||||||
|
"`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` when parsing "
|
||||||
|
"the cli args."
|
||||||
|
msgstr ""
|
||||||
|
"对于在线模式,vLLM 进程在解析命令行参数时,会在 "
|
||||||
|
"`vllm/vllm/engine/arg_utils.py::AsyncEngineArgs.add_cli_args` 这里调用平台补丁。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:35
|
||||||
|
msgid ""
|
||||||
|
"For offline mode, vLLM process calls the platform patch here "
|
||||||
|
"`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config` when "
|
||||||
|
"parsing the input parameters."
|
||||||
|
msgstr ""
|
||||||
|
"对于离线模式,vLLM 进程在解析输入参数时,会在此处调用平台补丁 "
|
||||||
|
"`vllm/vllm/engine/arg_utils.py::EngineArgs.create_engine_config`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:36
|
||||||
|
msgid ""
|
||||||
|
"**worker**: The patch code in this directory is for patching the code in "
|
||||||
|
"vLLM worker process. It's called by "
|
||||||
|
"`vllm_ascend/worker/worker_v1::NPUWorker::__init__` when the vLLM worker "
|
||||||
|
"process is initialized."
|
||||||
|
msgstr ""
|
||||||
|
"**worker**:此目录中的补丁代码用于修补 vLLM worker 进程中的代码。在初始化 vLLM worker 进程时,会被 "
|
||||||
|
"`vllm_ascend/worker/worker_v1::NPUWorker::__init__` 调用。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:37
|
||||||
|
msgid ""
|
||||||
|
"For both online and offline mode, vLLM engine core process calls the worker "
|
||||||
|
"patch here `vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker` "
|
||||||
|
"when initializing the worker process."
|
||||||
|
msgstr ""
|
||||||
|
"无论是在线还是离线模式,vLLM 引擎核心进程在初始化 worker 进程时,都会在这里调用 worker "
|
||||||
|
"补丁:`vllm/vllm/worker/worker_base.py::WorkerWrapperBase.init_worker`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:39
|
||||||
|
msgid ""
|
||||||
|
"In both **platform** and **worker** folder, there are several patch modules."
|
||||||
|
" They are used for patching different version of vLLM."
|
||||||
|
msgstr "在 **platform** 和 **worker** 文件夹中都有一些补丁模块。它们用于修补不同版本的 vLLM。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:41
|
||||||
|
msgid ""
|
||||||
|
"`patch_0_9_2`: This module is used for patching vLLM 0.9.2. The version is "
|
||||||
|
"always the nearest version of vLLM. Once vLLM is released, we will drop this"
|
||||||
|
" patch module and bump to a new version. For example, `patch_0_9_2` is used "
|
||||||
|
"for patching vLLM 0.9.2."
|
||||||
|
msgstr ""
|
||||||
|
"`patch_0_9_2`:此模块用于修补 vLLM 0.9.2。该版本始终对应于 vLLM 的最近版本。一旦 vLLM "
|
||||||
|
"发布新版本,我们将移除此补丁模块并升级到新版本。例如,`patch_0_9_2` 就是用于修补 vLLM 0.9.2 的。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:42
|
||||||
|
msgid ""
|
||||||
|
"`patch_main`: This module is used for patching the code in vLLM main branch."
|
||||||
|
msgstr "`patch_main`:该模块用于修补 vLLM 主分支代码。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:43
|
||||||
|
msgid ""
|
||||||
|
"`patch_common`: This module is used for patching both vLLM 0.9.2 and vLLM "
|
||||||
|
"main branch."
|
||||||
|
msgstr "`patch_common`:此模块用于同时修补 vLLM 0.9.2 版本和 vLLM 主分支。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:45
|
||||||
|
msgid "How to write a patch"
|
||||||
|
msgstr "如何撰写补丁"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:47
|
||||||
|
msgid ""
|
||||||
|
"Before writing a patch, following the principle above, we should patch the "
|
||||||
|
"least code. If it's necessary, we can patch the code in either **platform** "
|
||||||
|
"and **worker** folder. Here is an example to patch `distributed` module in "
|
||||||
|
"vLLM."
|
||||||
|
msgstr ""
|
||||||
|
"在编写补丁之前,遵循上述原则,我们应尽量修改最少的代码。如果有必要,我们可以修改 **platform** 和 **worker** "
|
||||||
|
"文件夹中的代码。下面是一个在 vLLM 中修改 `distributed` 模块的示例。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:49
|
||||||
|
msgid ""
|
||||||
|
"Decide which version of vLLM we should patch. For example, after analysis, "
|
||||||
|
"here we want to patch both 0.9.2 and main of vLLM."
|
||||||
|
msgstr "决定我们应该修补哪个版本的 vLLM。例如,经过分析后,这里我们想要同时修补 vLLM 的 0.9.2 版和主分支(main)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:50
|
||||||
|
msgid ""
|
||||||
|
"Decide which process we should patch. For example, here `distributed` "
|
||||||
|
"belongs to the vLLM main process, so we should patch `platform`."
|
||||||
|
msgstr "决定我们应该修补哪个进程。例如,这里 `distributed` 属于 vLLM 主进程,所以我们应该修补 `platform`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:51
|
||||||
|
#, python-brace-format
|
||||||
|
msgid ""
|
||||||
|
"Create the patch file in the right folder. The file should be named as "
|
||||||
|
"`patch_{module_name}.py`. The example here is "
|
||||||
|
"`vllm_ascend/patch/platform/patch_common/patch_distributed.py`."
|
||||||
|
msgstr ""
|
||||||
|
"在正确的文件夹中创建补丁文件。文件应命名为 `patch_{module_name}.py`。此处的示例是 "
|
||||||
|
"`vllm_ascend/patch/platform/patch_common/patch_distributed.py`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:52
|
||||||
|
msgid "Write your patch code in the new file. Here is an example:"
|
||||||
|
msgstr "在新文件中编写你的补丁代码。以下是一个示例:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:62
|
||||||
|
msgid ""
|
||||||
|
"Import the patch file in `__init__.py`. In this example, add `import "
|
||||||
|
"vllm_ascend.patch.platform.patch_common.patch_distributed` into "
|
||||||
|
"`vllm_ascend/patch/platform/patch_common/__init__.py`."
|
||||||
|
msgstr ""
|
||||||
|
"在 `__init__.py` 中导入补丁文件。在这个示例中,将 `import "
|
||||||
|
"vllm_ascend.patch.platform.patch_common.patch_distributed` 添加到 "
|
||||||
|
"`vllm_ascend/patch/platform/patch_common/__init__.py` 中。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:63
|
||||||
|
msgid ""
|
||||||
|
"Add the description of the patch in `vllm_ascend/patch/__init__.py`. The "
|
||||||
|
"description format is as follows:"
|
||||||
|
msgstr "在 `vllm_ascend/patch/__init__.py` 中添加补丁的描述。描述格式如下:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:77
|
||||||
|
msgid ""
|
||||||
|
"Add the Unit Test and E2E Test. Any newly added code in vLLM Ascend should "
|
||||||
|
"contain the Unit Test and E2E Test as well. You can find more details in "
|
||||||
|
"[test guide](../contribution/testing.md)"
|
||||||
|
msgstr ""
|
||||||
|
"添加单元测试和端到端(E2E)测试。在 vLLM Ascend 中新增的任何代码也应包含单元测试和端到端测试。更多详情请参见 "
|
||||||
|
"[测试指南](../contribution/testing.md)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:80
|
||||||
|
msgid "Limitation"
|
||||||
|
msgstr "限制"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:81
|
||||||
|
msgid ""
|
||||||
|
"In V1 Engine, vLLM starts three kinds of process: Main process, EngineCore "
|
||||||
|
"process and Worker process. Now vLLM Ascend only support patch the code in "
|
||||||
|
"Main process and Worker process by default. If you want to patch the code "
|
||||||
|
"runs in EngineCore process, you should patch EngineCore process entirely "
|
||||||
|
"during setup, the entry code is here `vllm.v1.engine.core`. Please override "
|
||||||
|
"`EngineCoreProc` and `DPEngineCoreProc` entirely."
|
||||||
|
msgstr ""
|
||||||
|
"在 V1 引擎中,vLLM 会启动三种类型的进程:主进程、EngineCore 进程和 Worker 进程。现在 vLLM Ascend "
|
||||||
|
"默认只支持在主进程和 Worker 进程中打补丁代码。如果你想要在 EngineCore 进程中打补丁,你需要在设置阶段对 EngineCore "
|
||||||
|
"进程整体打补丁,入口代码在 `vllm.v1.engine.core`。请完全重写 `EngineCoreProc` 和 "
|
||||||
|
"`DPEngineCoreProc`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/feature_guide/patch.md:82
|
||||||
|
msgid ""
|
||||||
|
"If you are running an edited vLLM code, the version of the vLLM may be "
|
||||||
|
"changed automatically. For example, if you runs an edited vLLM based on "
|
||||||
|
"v0.9.n, the version of vLLM may be change to v0.9.nxxx, in this case, the "
|
||||||
|
"patch for v0.9.n in vLLM Ascend would not work as expect, because that vLLM "
|
||||||
|
"Ascend can't distinguish the version of vLLM you're using. In this case, you"
|
||||||
|
" can set the environment variable `VLLM_VERSION` to specify the version of "
|
||||||
|
"vLLM you're using, then the patch for v0.9.2 should work."
|
||||||
|
msgstr ""
|
||||||
|
"如果你运行的是经过编辑的 vLLM 代码,vLLM 的版本可能会被自动更改。例如,如果你基于 v0.9.n 运行了编辑后的 vLLM,vLLM "
|
||||||
|
"的版本可能会变为 v0.9.nxxx,在这种情况下,vLLM Ascend 的 v0.9.n 补丁将无法正常工作,因为 vLLM Ascend "
|
||||||
|
"无法区分你所使用的 vLLM 版本。这时,你可以设置环境变量 `VLLM_VERSION` 来指定你所使用的 vLLM 版本,这样对 v0.9.2 "
|
||||||
|
"的补丁就应该可以正常工作。"
|
||||||
@@ -0,0 +1,333 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:1
|
||||||
|
msgid "Adding a New Model"
|
||||||
|
msgstr "添加新模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:3
|
||||||
|
msgid ""
|
||||||
|
"This guide demonstrates how to integrate a novel or customized model into "
|
||||||
|
"vllm-ascend. For foundational concepts, it is highly recommended to refer to"
|
||||||
|
" [vllm official doc: Adding a New "
|
||||||
|
"Model](https://docs.vllm.ai/en/stable/contributing/model/) first."
|
||||||
|
msgstr ""
|
||||||
|
"本指南演示如何将新颖或自定义的模型集成到 vllm-ascend 中。对于基础概念,强烈建议先参考 [vllm "
|
||||||
|
"官方文档:添加新模型](https://docs.vllm.ai/en/stable/contributing/model/)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:6
|
||||||
|
msgid "Step 1: Implementing Models with `torch` and `torch_npu`"
|
||||||
|
msgstr "步骤 1:使用 `torch` 和 `torch_npu` 实现模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:8
|
||||||
|
msgid ""
|
||||||
|
"This section provides instructions for implementing new models compatible "
|
||||||
|
"with vllm and vllm-ascend."
|
||||||
|
msgstr "本节提供了实现与 vllm 和 vllm-ascend 兼容的新模型的相关说明。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:10
|
||||||
|
msgid "**Before starting:**"
|
||||||
|
msgstr "**开始之前:**"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:12
|
||||||
|
msgid ""
|
||||||
|
"Verify whether your model already exists in vllm's "
|
||||||
|
"[models](https://github.com/vllm-"
|
||||||
|
"project/vllm/tree/main/vllm/model_executor/models) directory."
|
||||||
|
msgstr ""
|
||||||
|
"请确认你的模型是否已经存在于 vllm 的 [models](https://github.com/vllm-"
|
||||||
|
"project/vllm/tree/main/vllm/model_executor/models) 目录中。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:13
|
||||||
|
msgid ""
|
||||||
|
"Use existing models' implementation as templates to accelerate your "
|
||||||
|
"development."
|
||||||
|
msgstr "使用已有模型的实现作为模板以加速您的开发。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:15
|
||||||
|
msgid "Method 1: Implementing New Models from Scratch"
|
||||||
|
msgstr "方法一:从零开始实现新模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:17
|
||||||
|
msgid ""
|
||||||
|
"Follow vllm's [OPT model "
|
||||||
|
"adaptation](https://docs.vllm.ai/en/stable/contributing/model/basic.html) "
|
||||||
|
"example for guidance."
|
||||||
|
msgstr ""
|
||||||
|
"请参考 vllm 的 [OPT "
|
||||||
|
"模型适配](https://docs.vllm.ai/en/stable/contributing/model/basic.html) 示例进行操作。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:19
|
||||||
|
msgid "**Key implementation requirements:**"
|
||||||
|
msgstr "**关键实现要求:**"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:21
|
||||||
|
msgid "Place model files in `vllm_ascend/models/` directory."
|
||||||
|
msgstr "请将模型文件放在 `vllm_ascend/models/` 目录下。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:23
|
||||||
|
msgid ""
|
||||||
|
"Standard module structure for decoder-only LLMs (please checkout vllm's "
|
||||||
|
"implementations for other kinds of model):"
|
||||||
|
msgstr "解码器-only LLMs 的标准模块结构(请参考 vllm 对其他类型模型的实现):"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:25
|
||||||
|
msgid "`*ModelForCausalLM` (top-level wrapper)"
|
||||||
|
msgstr "`*ModelForCausalLM`(顶层包装器)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:26
|
||||||
|
msgid "`*Model` (main architecture)"
|
||||||
|
msgstr "`*Model`(主架构)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:27
|
||||||
|
msgid "`*DecoderLayer` (transformer block)"
|
||||||
|
msgstr "`*DecoderLayer` (transformer 块)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:28
|
||||||
|
msgid "`*Attention` and `*MLP` (specific computation unit)"
|
||||||
|
msgstr "`*Attention` 和 `*MLP`(特定计算单元)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:31
|
||||||
|
msgid "`*` denotes your model's unique identifier."
|
||||||
|
msgstr "`*` 表示你的模型的唯一标识符。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:34
|
||||||
|
msgid "Critical Implementation Details:"
|
||||||
|
msgstr "关键实现细节:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:36
|
||||||
|
msgid "All modules must include a `prefix` argument in `__init__()`."
|
||||||
|
msgstr "所有模块在 `__init__()` 方法中都必须包含一个 `prefix` 参数。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:38
|
||||||
|
msgid "**Required interfaces:**"
|
||||||
|
msgstr "**必需的接口:**"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "Module Type"
|
||||||
|
msgstr "模块类型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "Required Methods"
|
||||||
|
msgstr "必需的方法"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "`*ModelForCausalLM`"
|
||||||
|
msgstr "`*ModelForCausalLM`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "`get_input_embeddings`, `compute_logits`, `load_weights`"
|
||||||
|
msgstr "`get_input_embeddings`,`compute_logits`,`load_weights`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "`*Model`"
|
||||||
|
msgstr "`*模型`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:30
|
||||||
|
msgid "`get_input_embeddings`, `load_weights`"
|
||||||
|
msgstr "`get_input_embeddings`,`load_weights`"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:45
|
||||||
|
msgid "Attention Backend Integration:"
|
||||||
|
msgstr "注意后端集成:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:47
|
||||||
|
msgid ""
|
||||||
|
"Importing attention via `from vllm.attention import Attention` can "
|
||||||
|
"automatically leverage the attention backend routing of vllm-ascend (see: "
|
||||||
|
"`get_attn_backend_cls()` in `vllm_ascend/platform.py`)."
|
||||||
|
msgstr ""
|
||||||
|
"通过 `from vllm.attention import Attention` 导入 attention 可以自动利用 vllm-ascend "
|
||||||
|
"的注意力后端路由(详见:`vllm_ascend/platform.py` 中的 `get_attn_backend_cls()`)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:49
|
||||||
|
msgid "Tensor Parallelism:"
|
||||||
|
msgstr "张量并行:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:51
|
||||||
|
msgid ""
|
||||||
|
"Use vllm's parallel layers (`ColumnParallelLinear`, "
|
||||||
|
"`VocabParallelEmbedding`, etc.) to implement models supporting tensor "
|
||||||
|
"parallelism. Note that Ascend-specific customizations are implemented in "
|
||||||
|
"`vllm_ascend/ops/` directory (RMSNorm, VocabParallelEmbedding, etc.)."
|
||||||
|
msgstr ""
|
||||||
|
"使用 vllm 的并行层(如 `ColumnParallelLinear`、`VocabParallelEmbedding` "
|
||||||
|
"等)来实现支持张量并行的模型。需要注意的是,Ascend 特有的自定义实现(如 RMSNorm、VocabParallelEmbedding 等)位于 "
|
||||||
|
"`vllm_ascend/ops/` 目录下。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:53
|
||||||
|
msgid ""
|
||||||
|
"**Reference Implementation Template** (assumed path: "
|
||||||
|
"`vllm_ascend/models/custom_model.py`):"
|
||||||
|
msgstr "**参考实现模板**(假定路径:`vllm_ascend/models/custom_model.py`):"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:135
|
||||||
|
msgid "Method 2: Customizing Existing vLLM Models"
|
||||||
|
msgstr "方法二:自定义已有的 vLLM 模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:137
|
||||||
|
msgid ""
|
||||||
|
"For most use cases, extending existing implementations is preferable. We "
|
||||||
|
"demonstrate an example to inherit from base classes and implement a custom "
|
||||||
|
"deepseek model below (assumed path: `vllm_ascend/models/deepseek_v2.py`)."
|
||||||
|
msgstr ""
|
||||||
|
"对于大多数使用场景,建议扩展已有的实现。我们在下面演示了一个示例,通过继承基类并实现一个自定义的 deepseek "
|
||||||
|
"模型(假定路径:`vllm_ascend/models/deepseek_v2.py`)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:175
|
||||||
|
msgid ""
|
||||||
|
"For a complete implementation reference, see: "
|
||||||
|
"`vllm_ascend/models/deepseek_v2.py`."
|
||||||
|
msgstr "完整的实现参考请见:`vllm_ascend/models/deepseek_v2.py`。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:178
|
||||||
|
msgid "Step 2: Registering Custom Models using ModelRegistry Plugins in vLLM"
|
||||||
|
msgstr "第2步:使用 vLLM 中的 ModelRegistry 插件注册自定义模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:180
|
||||||
|
msgid ""
|
||||||
|
"vllm provides a plugin mechanism for registering externally implemented "
|
||||||
|
"models without modifying its codebase."
|
||||||
|
msgstr "vllm 提供了一种插件机制,可用于注册外部实现的模型,而无需修改其代码库。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:182
|
||||||
|
msgid ""
|
||||||
|
"To integrate your implemented model from `vllm_ascend/models/` directory:"
|
||||||
|
msgstr "要集成你在 `vllm_ascend/models/` 目录下实现的模型:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:184
|
||||||
|
msgid ""
|
||||||
|
"Import your model implementation in `vllm_ascend/models/__init__.py` using "
|
||||||
|
"relative imports."
|
||||||
|
msgstr "使用相对导入在 `vllm_ascend/models/__init__.py` 中导入你的模型实现。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:185
|
||||||
|
msgid ""
|
||||||
|
"Register the model wrapper class via `vllm.ModelRegistry.register_model()` "
|
||||||
|
"function."
|
||||||
|
msgstr "通过 `vllm.ModelRegistry.register_model()` 函数注册模型包装类。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:187
|
||||||
|
msgid ""
|
||||||
|
"**Reference Registration Template** (an example of registering new models in"
|
||||||
|
" `vllm_ascend/models/__init__.py`):"
|
||||||
|
msgstr "**参考注册模板**(在 `vllm_ascend/models/__init__.py` 注册新模型的示例):"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:210
|
||||||
|
msgid ""
|
||||||
|
"The first argument of `vllm.ModelRegistry.register_model()` indicates the "
|
||||||
|
"unique architecture identifier which must match `architectures` in "
|
||||||
|
"`config.json` of the model."
|
||||||
|
msgstr ""
|
||||||
|
"`vllm.ModelRegistry.register_model()` 的第一个参数表示唯一的架构标识符,这个标识符必须与模型的 "
|
||||||
|
"`config.json` 文件中的 `architectures` 匹配。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:221
|
||||||
|
msgid "Step 3: Verification"
|
||||||
|
msgstr "第 3 步:验证"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:223
|
||||||
|
msgid "Case 1: Overriding Existing vLLM Model Architecture"
|
||||||
|
msgstr "案例 1:重载已有的 vLLM 模型架构"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:225
|
||||||
|
msgid ""
|
||||||
|
"If you're registering a customized model architecture based on vllm's "
|
||||||
|
"existing implementation (overriding vllm's original class), when executing "
|
||||||
|
"vllm offline/online inference (using any model), you'll observe warning logs"
|
||||||
|
" similar to the following output from "
|
||||||
|
"`vllm/models_executor/models/registry.py`."
|
||||||
|
msgstr ""
|
||||||
|
"如果你基于 vllm 的现有实现注册了一个自定义的模型架构(覆盖了 vllm 的原始类),在执行 vllm "
|
||||||
|
"的离线/在线推理(无论使用哪个模型)时,你会看到类似于 `vllm/models_executor/models/registry.py` "
|
||||||
|
"输出的警告日志。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:231
|
||||||
|
msgid "Case 2: Registering New Model Architecture"
|
||||||
|
msgstr "案例2:注册新模型架构"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:233
|
||||||
|
msgid ""
|
||||||
|
"If you're registering a novel model architecture not present in vllm "
|
||||||
|
"(creating a completely new class), current logs won't provide explicit "
|
||||||
|
"confirmation by default. It's recommended to add the following logging "
|
||||||
|
"statement at the end of the `register_model` method in "
|
||||||
|
"`vllm/models_executor/models/registry.py`."
|
||||||
|
msgstr ""
|
||||||
|
"如果你注册了 vllm 中不存在的新模型架构(创建一个全新的类),当前日志默认不会提供明确的确认信息。建议在 "
|
||||||
|
"`vllm/models_executor/models/registry.py` 文件中的 `register_model` "
|
||||||
|
"方法末尾添加如下日志语句。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:239
|
||||||
|
msgid ""
|
||||||
|
"After adding this line, you will see confirmation logs shown below when "
|
||||||
|
"running vllm offline/online inference (using any model)."
|
||||||
|
msgstr "添加这一行之后,当你运行 vllm 离线/在线推理(使用任何模型)时,将会看到如下确认日志。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:245
|
||||||
|
msgid ""
|
||||||
|
"This log output confirms your novel model architecture has been successfully"
|
||||||
|
" registered in vllm."
|
||||||
|
msgstr "该日志输出确认了你的新模型架构已成功在 vllm 中注册。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:247
|
||||||
|
msgid "Step 4: Testing"
|
||||||
|
msgstr "第4步:测试"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:249
|
||||||
|
msgid ""
|
||||||
|
"After adding a new model, we should do basic functional test (offline/online"
|
||||||
|
" inference), accuracy test and performance benchmark for the model."
|
||||||
|
msgstr "在添加新模型后,我们应对该模型进行基本功能测试(离线/在线推理)、准确率测试和性能基准测试。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:251
|
||||||
|
msgid "Find more details at:"
|
||||||
|
msgstr "更多详情请见:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:253
|
||||||
|
msgid ""
|
||||||
|
"[Accuracy test guide](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html)"
|
||||||
|
msgstr ""
|
||||||
|
"[精度测试指南](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/developer_guide/evaluation/index.html)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:254
|
||||||
|
msgid ""
|
||||||
|
"[Performance benchmark guide](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)"
|
||||||
|
msgstr ""
|
||||||
|
"[性能基准指南](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/developer_guide/performance/performance_benchmark.html)"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:256
|
||||||
|
msgid "Step 5: Updating Supported Models Doc"
|
||||||
|
msgstr "第5步:更新支持的模型文档"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_model.md:258
|
||||||
|
msgid ""
|
||||||
|
"At last, if all the steps above are completed, you should add the new model "
|
||||||
|
"into our [Supported Models](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/supported_models.html) doc."
|
||||||
|
msgstr ""
|
||||||
|
"最后,如果以上所有步骤都已完成,你应该将新模型添加到我们的[支持的模型](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/supported_models.html)文档中。"
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:1
|
||||||
|
msgid "Adding a New Multi-Modal Model"
|
||||||
|
msgstr "添加新的多模态模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/adding_a_new_multimodal_model.md:3
|
||||||
|
msgid "**_Comming soon ..._**"
|
||||||
|
msgstr "**_敬请期待 ..._**"
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/index.md:1
|
||||||
|
#: ../../developer_guide/modeling/index.md:5
|
||||||
|
msgid "Modeling"
|
||||||
|
msgstr "新模型"
|
||||||
|
|
||||||
|
#: ../../developer_guide/modeling/index.md:3
|
||||||
|
msgid ""
|
||||||
|
"This section provides tutorials of how to implement and register a new model"
|
||||||
|
" into vllm-ascend."
|
||||||
|
msgstr "本节提供了如何在 vllm-ascend 中实现并注册新模型的教程。"
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/index.md:1
|
||||||
|
#: ../../developer_guide/performance/index.md:3
|
||||||
|
msgid "Performance"
|
||||||
|
msgstr "性能"
|
||||||
@@ -0,0 +1,88 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:1
|
||||||
|
msgid "Performance Benchmark"
|
||||||
|
msgstr "性能基准"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:2
|
||||||
|
msgid ""
|
||||||
|
"This document details the benchmark methodology for vllm-ascend, aimed at "
|
||||||
|
"evaluating the performance under a variety of workloads. To maintain "
|
||||||
|
"alignment with vLLM, we use the [benchmark](https://github.com/vllm-"
|
||||||
|
"project/vllm/tree/main/benchmarks) script provided by the vllm project."
|
||||||
|
msgstr ""
|
||||||
|
"本文档详细说明了 vllm-ascend 的基准测试方法,旨在评估其在多种工作负载下的性能。为了与 vLLM 保持一致,我们使用 vllm 项目提供的 "
|
||||||
|
"[benchmark](https://github.com/vllm-project/vllm/tree/main/benchmarks) 脚本。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:4
|
||||||
|
msgid ""
|
||||||
|
"**Benchmark Coverage**: We measure offline e2e latency and throughput, and "
|
||||||
|
"fixed-QPS online serving benchmarks, for more details see [vllm-ascend "
|
||||||
|
"benchmark scripts](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/tree/main/benchmarks)."
|
||||||
|
msgstr ""
|
||||||
|
"**基准测试覆盖范围**:我们测量离线端到端延迟和吞吐量,以及固定 QPS 的在线服务基准测试。更多详情请参见 [vllm-ascend "
|
||||||
|
"基准测试脚本](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks)。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:6
|
||||||
|
msgid "1. Run docker container"
|
||||||
|
msgstr "1. 运行 docker 容器"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:31
|
||||||
|
msgid "2. Install dependencies"
|
||||||
|
msgstr "2. 安装依赖项"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:38
|
||||||
|
msgid "3. (Optional)Prepare model weights"
|
||||||
|
msgstr "3.(可选)准备模型权重"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:39
|
||||||
|
msgid ""
|
||||||
|
"For faster running speed, we recommend downloading the model in advance:"
|
||||||
|
msgstr "为了更快的运行速度,建议提前下载模型:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:44
|
||||||
|
msgid ""
|
||||||
|
"You can also replace all model paths in the [json](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/tree/main/benchmarks/tests) files with your local paths:"
|
||||||
|
msgstr ""
|
||||||
|
"你也可以将 [json](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/tree/main/benchmarks/tests) 文件中的所有模型路径替换为你的本地路径:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:60
|
||||||
|
msgid "4. Run benchmark script"
|
||||||
|
msgstr "4. 运行基准测试脚本"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:61
|
||||||
|
msgid "Run benchmark script:"
|
||||||
|
msgstr "运行基准测试脚本:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:66
|
||||||
|
msgid "After about 10 mins, the output is as shown below:"
|
||||||
|
msgstr "大约 10 分钟后,输出如下所示:"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/performance_benchmark.md:176
|
||||||
|
msgid ""
|
||||||
|
"The result json files are generated into the path `benchmark/results` These "
|
||||||
|
"files contain detailed benchmarking results for further analysis."
|
||||||
|
msgstr "结果 json 文件会生成到路径 `benchmark/results`。这些文件包含了用于进一步分析的详细基准测试结果。"
|
||||||
@@ -0,0 +1,81 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:1
|
||||||
|
msgid "Profile Execute Duration"
|
||||||
|
msgstr "分析各阶段执行耗时"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:3
|
||||||
|
msgid ""
|
||||||
|
"The execution duration of each stage (including pre/post-processing, model "
|
||||||
|
"forward, etc.) usually needs to be captured during a complete inference "
|
||||||
|
"process. Typically, this is done by using `torch.npu.synchronize()` and "
|
||||||
|
"obtaining CPU timestamps, which increases the performance overhead of "
|
||||||
|
"host/device synchronization."
|
||||||
|
msgstr ""
|
||||||
|
"在完整的推理过程中,通常需要记录每个阶段(包括前/后处理、模型前向等)的执行时长。一般通过使用 `torch.npu.synchronize()` "
|
||||||
|
"并获取 CPU 时间戳来实现,这会增加主机/设备同步的性能开销。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:5
|
||||||
|
msgid ""
|
||||||
|
"**To reduce the performance overhead, we add this feature, using the NPU "
|
||||||
|
"event timestamp mechanism to observe the device execution time "
|
||||||
|
"asynchronously.**"
|
||||||
|
msgstr "**为了减少性能开销,我们添加了此功能,使用 NPU 事件时间戳机制异步观测设备的执行时间。**"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:7
|
||||||
|
msgid "Usage"
|
||||||
|
msgstr "用法"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:8
|
||||||
|
msgid ""
|
||||||
|
"Use the environment variable `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` to "
|
||||||
|
"enable this feature."
|
||||||
|
msgstr "使用环境变量 `VLLM_ASCEND_MODEL_EXECUTE_TIME_OBSERVE` 来启用此功能。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:9
|
||||||
|
msgid ""
|
||||||
|
"Use the non-blocking API `ProfileExecuteDuration().capture_async` to set "
|
||||||
|
"observation points asynchronously when you need to observe the execution "
|
||||||
|
"duration."
|
||||||
|
msgstr ""
|
||||||
|
"当你需要观察执行时长时,可以使用非阻塞 API `ProfileExecuteDuration().capture_async` 异步设置观察点。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:10
|
||||||
|
msgid ""
|
||||||
|
"Use the blocking API `ProfileExecuteDuration().pop_captured_sync` at an "
|
||||||
|
"appropriate time to get and print the execution durations of all observed "
|
||||||
|
"stages."
|
||||||
|
msgstr ""
|
||||||
|
"在适当的时机使用阻塞式 API `ProfileExecuteDuration().pop_captured_sync` "
|
||||||
|
"获取并打印所有已观察到阶段的执行时长。"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:12
|
||||||
|
msgid ""
|
||||||
|
"**We have instrumented the key inference stages (including pre-processing, "
|
||||||
|
"model forward pass, etc.) for execute duration profiling. Execute the script"
|
||||||
|
" as follows:**"
|
||||||
|
msgstr "**我们已经对关键的推理阶段(包括预处理、模型前向传递等)进行了执行时长分析的检测。请按如下方式执行脚本:**"
|
||||||
|
|
||||||
|
#: ../../developer_guide/performance/profile_execute_duration.md:17
|
||||||
|
msgid "Example Output"
|
||||||
|
msgstr "示例输出"
|
||||||
479
docs/source/locale/zh_CN/LC_MESSAGES/faqs.po
Normal file
479
docs/source/locale/zh_CN/LC_MESSAGES/faqs.po
Normal file
@@ -0,0 +1,479 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../faqs.md:1
|
||||||
|
msgid "FAQs"
|
||||||
|
msgstr "常见问题"
|
||||||
|
|
||||||
|
#: ../../faqs.md:3
|
||||||
|
msgid "Version Specific FAQs"
|
||||||
|
msgstr "特定版本常见问题"
|
||||||
|
|
||||||
|
#: ../../faqs.md:5
|
||||||
|
msgid ""
|
||||||
|
"[[v0.7.3.post1] FAQ & Feedback](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues/1007)"
|
||||||
|
msgstr ""
|
||||||
|
"[[v0.7.3.post1] 常见问题与反馈](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues/1007)"
|
||||||
|
|
||||||
|
#: ../../faqs.md:6
|
||||||
|
msgid ""
|
||||||
|
"[[v0.9.2rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues/1742)"
|
||||||
|
msgstr ""
|
||||||
|
"[[v0.9.2rc1] 常见问题与反馈](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues/1742)"
|
||||||
|
|
||||||
|
#: ../../faqs.md:8
|
||||||
|
msgid "General FAQs"
|
||||||
|
msgstr "常见问题解答"
|
||||||
|
|
||||||
|
#: ../../faqs.md:10
|
||||||
|
msgid "1. What devices are currently supported?"
|
||||||
|
msgstr "1. 目前支持哪些设备?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:12
|
||||||
|
msgid ""
|
||||||
|
"Currently, **ONLY** Atlas A2 series(Ascend-cann-kernels-910b) and Atlas "
|
||||||
|
"300I(Ascend-cann-kernels-310p) series are supported:"
|
||||||
|
msgstr ""
|
||||||
|
"目前,**仅**支持 Atlas A2 系列(Ascend-cann-kernels-910b)和 Atlas 300I(Ascend-cann-"
|
||||||
|
"kernels-310p)系列:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:14
|
||||||
|
msgid ""
|
||||||
|
"Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 "
|
||||||
|
"Box16, Atlas 300T A2)"
|
||||||
|
msgstr ""
|
||||||
|
"Atlas A2 训练系列(Atlas 800T A2,Atlas 900 A2 PoD,Atlas 200T A2 Box16,Atlas 300T "
|
||||||
|
"A2)"
|
||||||
|
|
||||||
|
#: ../../faqs.md:15
|
||||||
|
msgid "Atlas 800I A2 Inference series (Atlas 800I A2)"
|
||||||
|
msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)"
|
||||||
|
|
||||||
|
#: ../../faqs.md:16
|
||||||
|
msgid "Atlas 300I Inference series (Atlas 300I Duo)"
|
||||||
|
msgstr "Atlas 300I 推理系列(Atlas 300I Duo)"
|
||||||
|
|
||||||
|
#: ../../faqs.md:18
|
||||||
|
msgid "Below series are NOT supported yet:"
|
||||||
|
msgstr "以下系列目前尚不受支持:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:19
|
||||||
|
msgid "Atlas 200I A2 (Ascend-cann-kernels-310b) unplanned yet"
|
||||||
|
msgstr "Atlas 200I A2(Ascend-cann-kernels-310b)尚未计划"
|
||||||
|
|
||||||
|
#: ../../faqs.md:20
|
||||||
|
msgid "Ascend 910, Ascend 910 Pro B (Ascend-cann-kernels-910) unplanned yet"
|
||||||
|
msgstr "Ascend 910,Ascend 910 Pro B(Ascend-cann-kernels-910)尚未计划"
|
||||||
|
|
||||||
|
#: ../../faqs.md:22
|
||||||
|
msgid ""
|
||||||
|
"From a technical view, vllm-ascend support would be possible if the torch-"
|
||||||
|
"npu is supported. Otherwise, we have to implement it by using custom ops. We"
|
||||||
|
" are also welcome to join us to improve together."
|
||||||
|
msgstr ""
|
||||||
|
"从技术角度来看,如果支持 torch-npu,则可以支持 vllm-ascend。否则,我们需要通过自定义算子来实现。我们也欢迎大家一起加入,共同改进。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:24
|
||||||
|
msgid "2. How to get our docker containers?"
|
||||||
|
msgstr "2. 如何获取我们的 docker 容器?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:26
|
||||||
|
msgid ""
|
||||||
|
"You can get our containers at `Quay.io`, e.g., [<u>vllm-"
|
||||||
|
"ascend</u>](https://quay.io/repository/ascend/vllm-ascend?tab=tags) and "
|
||||||
|
"[<u>cann</u>](https://quay.io/repository/ascend/cann?tab=tags)."
|
||||||
|
msgstr ""
|
||||||
|
"你可以在 `Quay.io` 获取我们的容器,例如,[<u>vllm-"
|
||||||
|
"ascend</u>](https://quay.io/repository/ascend/vllm-ascend?tab=tags) 和 "
|
||||||
|
"[<u>cann</u>](https://quay.io/repository/ascend/cann?tab=tags)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:28
|
||||||
|
msgid ""
|
||||||
|
"If you are in China, you can use `daocloud` to accelerate your downloading:"
|
||||||
|
msgstr "如果你在中国,可以使用 `daocloud` 来加速下载:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:36
|
||||||
|
msgid "3. What models does vllm-ascend supports?"
|
||||||
|
msgstr "3. vllm-ascend 支持哪些模型?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:38
|
||||||
|
msgid ""
|
||||||
|
"Find more details [<u>here</u>](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)."
|
||||||
|
msgstr ""
|
||||||
|
"在[<u>此处</u>](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_models.html)查看更多详细信息。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:40
|
||||||
|
msgid "4. How to get in touch with our community?"
|
||||||
|
msgstr "4. 如何与我们的社区取得联系?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:42
|
||||||
|
msgid ""
|
||||||
|
"There are many channels that you can communicate with our community "
|
||||||
|
"developers / users:"
|
||||||
|
msgstr "你可以通过多种渠道与我们的社区开发者/用户进行交流:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:44
|
||||||
|
msgid ""
|
||||||
|
"Submit a GitHub [<u>issue</u>](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues?page=1)."
|
||||||
|
msgstr ""
|
||||||
|
"提交一个 GitHub [<u>issue</u>](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues?page=1)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:45
|
||||||
|
msgid ""
|
||||||
|
"Join our [<u>weekly "
|
||||||
|
"meeting</u>](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z)"
|
||||||
|
" and share your ideas."
|
||||||
|
msgstr ""
|
||||||
|
"加入我们的[<u>每周会议</u>](https://docs.google.com/document/d/1hCSzRTMZhIB8vRq1_qOOjx4c9uYUxvdQvDsMV2JcSrw/edit?tab=t.0#heading=h.911qu8j8h35z),并分享你的想法。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:46
|
||||||
|
msgid ""
|
||||||
|
"Join our [<u>WeChat</u>](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/issues/227) group and ask your quenstions."
|
||||||
|
msgstr ""
|
||||||
|
"加入我们的 [<u>微信群</u>](https://github.com/vllm-project/vllm-ascend/issues/227) "
|
||||||
|
"并提问你的问题。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:47
|
||||||
|
msgid ""
|
||||||
|
"Join our ascend channel in [<u>vLLM "
|
||||||
|
"forums</u>](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-"
|
||||||
|
"support/6) and publish your topics."
|
||||||
|
msgstr ""
|
||||||
|
"加入我们在 [<u>vLLM 论坛</u>](https://discuss.vllm.ai/c/hardware-support/vllm-"
|
||||||
|
"ascend-support/6) 的 ascend 频道并发布你的话题。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:49
|
||||||
|
msgid "5. What features does vllm-ascend V1 supports?"
|
||||||
|
msgstr "5. vllm-ascend V1 支持哪些功能?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:51
|
||||||
|
msgid ""
|
||||||
|
"Find more details [<u>here</u>](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)."
|
||||||
|
msgstr ""
|
||||||
|
"在[<u>这里</u>](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)找到更多详细信息。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:53
|
||||||
|
msgid ""
|
||||||
|
"6. How to solve the problem of \"Failed to infer device type\" or "
|
||||||
|
"\"libatb.so: cannot open shared object file\"?"
|
||||||
|
msgstr "6. 如何解决“无法推断设备类型”或“libatb.so:无法打开共享对象文件”问题?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:55
|
||||||
|
msgid ""
|
||||||
|
"Basically, the reason is that the NPU environment is not configured "
|
||||||
|
"correctly. You can:"
|
||||||
|
msgstr "基本上,原因是 NPU 环境没有正确配置。你可以:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:56
|
||||||
|
msgid ""
|
||||||
|
"try `source /usr/local/Ascend/nnal/atb/set_env.sh` to enable NNAL package."
|
||||||
|
msgstr "尝试运行 `source /usr/local/Ascend/nnal/atb/set_env.sh` 以启用 NNAL 包。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:57
|
||||||
|
msgid ""
|
||||||
|
"try `source /usr/local/Ascend/ascend-toolkit/set_env.sh` to enable CANN "
|
||||||
|
"package."
|
||||||
|
msgstr "尝试运行 `source /usr/local/Ascend/ascend-toolkit/set_env.sh` 以启用 CANN 包。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:58
|
||||||
|
msgid "try `npu-smi info` to check whether the NPU is working."
|
||||||
|
msgstr "尝试运行 `npu-smi info` 来检查 NPU 是否正常工作。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:60
|
||||||
|
msgid ""
|
||||||
|
"If all above steps are not working, you can try the following code with "
|
||||||
|
"python to check whether there is any error:"
|
||||||
|
msgstr "如果以上所有步骤都无效,你可以尝试使用以下 python 代码来检查是否有错误:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:68
|
||||||
|
msgid ""
|
||||||
|
"If all above steps are not working, feel free to submit a GitHub issue."
|
||||||
|
msgstr "如果以上所有步骤都无法解决问题,欢迎提交一个 GitHub issue。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:70
|
||||||
|
msgid "7. How does vllm-ascend perform?"
|
||||||
|
msgstr "7. vllm-ascend 的性能如何?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:72
|
||||||
|
msgid ""
|
||||||
|
"Currently, only some models are improved. Such as `Qwen2.5 VL`, `Qwen3`, "
|
||||||
|
"`Deepseek V3`. Others are not good enough. From 0.9.0rc2, Qwen and Deepseek"
|
||||||
|
" works with graph mode to play a good performance. What's more, you can "
|
||||||
|
"install `mindie-turbo` with `vllm-ascend v0.7.3` to speed up the inference "
|
||||||
|
"as well."
|
||||||
|
msgstr ""
|
||||||
|
"目前,只有部分模型得到了改进,比如 `Qwen2.5 VL`、`Qwen3` 和 `Deepseek V3`。其他模型的效果还不够理想。从 "
|
||||||
|
"0.9.0rc2 开始,Qwen 和 Deepseek 已经支持图模式,以获得更好的性能。此外,你还可以在 `vllm-ascend v0.7.3` "
|
||||||
|
"上安装 `mindie-turbo`,进一步加速推理。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:74
|
||||||
|
msgid "8. How vllm-ascend work with vllm?"
|
||||||
|
msgstr "8. vllm-ascend 如何与 vllm 协同工作?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:75
|
||||||
|
msgid ""
|
||||||
|
"vllm-ascend is a plugin for vllm. Basically, the version of vllm-ascend is "
|
||||||
|
"the same as the version of vllm. For example, if you use vllm 0.7.3, you "
|
||||||
|
"should use vllm-ascend 0.7.3 as well. For main branch, we will make sure "
|
||||||
|
"`vllm-ascend` and `vllm` are compatible by each commit."
|
||||||
|
msgstr ""
|
||||||
|
"vllm-ascend 是 vllm 的一个插件。基本上,vllm-ascend 的版本与 vllm 的版本是相同的。例如,如果你使用 vllm "
|
||||||
|
"0.7.3,你也应该使用 vllm-ascend 0.7.3。对于主分支,我们会确保每次提交都让 `vllm-ascend` 和 `vllm` "
|
||||||
|
"保持兼容。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:77
|
||||||
|
msgid "9. Does vllm-ascend support Prefill Disaggregation feature?"
|
||||||
|
msgstr "9. vllm-ascend 支持 Prefill Disaggregation 功能吗?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:79
|
||||||
|
msgid ""
|
||||||
|
"Currently, only 1P1D is supported on V0 Engine. For V1 Engine or NPND "
|
||||||
|
"support, We will make it stable and supported by vllm-ascend in the future."
|
||||||
|
msgstr "目前,V0引擎只支持1P1D。对于V1引擎或NPND的支持,我们将在未来使其稳定并由vllm-ascend支持。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:81
|
||||||
|
msgid "10. Does vllm-ascend support quantization method?"
|
||||||
|
msgstr "10. vllm-ascend 支持量化方法吗?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:83
|
||||||
|
msgid ""
|
||||||
|
"Currently, w8a8 quantization is already supported by vllm-ascend originally "
|
||||||
|
"on v0.8.4rc2 or higher, If you're using vllm 0.7.3 version, w8a8 "
|
||||||
|
"quantization is supporeted with the integration of vllm-ascend and mindie-"
|
||||||
|
"turbo, please use `pip install vllm-ascend[mindie-turbo]`."
|
||||||
|
msgstr ""
|
||||||
|
"目前,w8a8 量化已在 v0.8.4rc2 或更高版本的 vllm-ascend 中原生支持。如果你使用的是 vllm 0.7.3 版本,集成了 "
|
||||||
|
"vllm-ascend 和 mindie-turbo 后也支持 w8a8 量化,请使用 `pip install vllm-ascend[mindie-"
|
||||||
|
"turbo]`。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:85
|
||||||
|
msgid "11. How to run w8a8 DeepSeek model?"
|
||||||
|
msgstr "11. 如何运行 w8a8 DeepSeek 模型?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:87
|
||||||
|
msgid ""
|
||||||
|
"Please following the [inferencing tutorail](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/tutorials/multi_node.html) and replace model"
|
||||||
|
" to DeepSeek."
|
||||||
|
msgstr ""
|
||||||
|
"请按照[推理教程](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/tutorials/multi_node.html)进行操作,并将模型更换为 "
|
||||||
|
"DeepSeek。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:89
|
||||||
|
msgid ""
|
||||||
|
"12. There is no output in log when loading models using vllm-ascend, How to "
|
||||||
|
"solve it?"
|
||||||
|
msgstr "12. 使用 vllm-ascend 加载模型时日志没有输出,如何解决?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:91
|
||||||
|
msgid ""
|
||||||
|
"If you're using vllm 0.7.3 version, this is a known progress bar display "
|
||||||
|
"issue in VLLM, which has been resolved in [this PR](https://github.com/vllm-"
|
||||||
|
"project/vllm/pull/12428), please cherry-pick it locally by yourself. "
|
||||||
|
"Otherwise, please fill up an issue."
|
||||||
|
msgstr ""
|
||||||
|
"如果你正在使用 vllm 0.7.3 版本,这是 VLLM 已知的进度条显示问题,已在 [此 PR](https://github.com/vllm-"
|
||||||
|
"project/vllm/pull/12428) 中解决,请自行在本地进行 cherry-pick。否则,请提交一个 issue。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:93
|
||||||
|
msgid "13. How vllm-ascend is tested"
|
||||||
|
msgstr "13. 如何测试 vllm-ascend"
|
||||||
|
|
||||||
|
#: ../../faqs.md:95
|
||||||
|
msgid ""
|
||||||
|
"vllm-ascend is tested by functional test, performance test and accuracy "
|
||||||
|
"test."
|
||||||
|
msgstr "vllm-ascend 经过功能测试、性能测试和精度测试。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:97
|
||||||
|
msgid ""
|
||||||
|
"**Functional test**: we added CI, includes portion of vllm's native unit "
|
||||||
|
"tests and vllm-ascend's own unit tests,on vllm-ascend's test, we test basic "
|
||||||
|
"functionality、popular models availability and [supported "
|
||||||
|
"features](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)"
|
||||||
|
" via e2e test"
|
||||||
|
msgstr ""
|
||||||
|
"**功能测试**:我们添加了CI,包含了vllm原生单元测试的一部分以及vllm-ascend自己的单元测试。在vllm-"
|
||||||
|
"ascend的测试中,我们通过e2e测试验证了基本功能、主流模型可用性和[支持的特性](https://vllm-"
|
||||||
|
"ascend.readthedocs.io/en/latest/user_guide/support_matrix/supported_features.html)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:99
|
||||||
|
msgid ""
|
||||||
|
"**Performance test**: we provide [benchmark](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/tree/main/benchmarks) tools for end-to-end performance "
|
||||||
|
"benchmark which can easily to re-route locally, we'll publish a perf website"
|
||||||
|
" to show the performance test results for each pull request"
|
||||||
|
msgstr ""
|
||||||
|
"**性能测试**:我们提供了用于端到端性能基准测试的[基准测试](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/tree/main/benchmarks)工具,可以方便地在本地重新运行。我们将发布一个性能网站,用于展示每个拉取请求的性能测试结果。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:101
|
||||||
|
msgid ""
|
||||||
|
"**Accuracy test**: we're working on adding accuracy test to CI as well."
|
||||||
|
msgstr "**准确性测试**:我们也在努力将准确性测试添加到CI中。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:103
|
||||||
|
msgid ""
|
||||||
|
"Finally, for each release, we'll publish the performance test and accuracy "
|
||||||
|
"test report in the future."
|
||||||
|
msgstr "最后,未来每个版本发布时,我们都会公开性能测试和准确性测试报告。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:105
|
||||||
|
msgid "14. How to fix the error \"InvalidVersion\" when using vllm-ascend?"
|
||||||
|
msgstr "14. 使用 vllm-ascend 时如何解决 “InvalidVersion” 错误?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:106
|
||||||
|
msgid ""
|
||||||
|
"It's usually because you have installed a dev/editable version of vLLM "
|
||||||
|
"package. In this case, we provide the env variable `VLLM_VERSION` to let "
|
||||||
|
"users specify the version of vLLM package to use. Please set the env "
|
||||||
|
"variable `VLLM_VERSION` to the version of vLLM package you have installed. "
|
||||||
|
"The format of `VLLM_VERSION` should be `X.Y.Z`."
|
||||||
|
msgstr ""
|
||||||
|
"这通常是因为你安装了开发版或可编辑版本的 vLLM 包。在这种情况下,我们提供了环境变量 `VLLM_VERSION`,以便用户指定要使用的 vLLM "
|
||||||
|
"包版本。请将环境变量 `VLLM_VERSION` 设置为你已安装的 vLLM 包的版本。`VLLM_VERSION` 的格式应为 `X.Y.Z`。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:108
|
||||||
|
msgid "15. How to handle Out Of Memory?"
|
||||||
|
msgstr "15. 如何处理内存溢出?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:109
|
||||||
|
msgid ""
|
||||||
|
"OOM errors typically occur when the model exceeds the memory capacity of a "
|
||||||
|
"single NPU. For general guidance, you can refer to [vLLM's OOM "
|
||||||
|
"troubleshooting "
|
||||||
|
"documentation](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-"
|
||||||
|
"of-memory)."
|
||||||
|
msgstr ""
|
||||||
|
"当模型超出单个 NPU 的内存容量时,通常会发生 OOM(内存溢出)错误。一般性的指导可以参考 [vLLM 的 OOM "
|
||||||
|
"故障排除文档](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#out-"
|
||||||
|
"of-memory)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:111
|
||||||
|
msgid ""
|
||||||
|
"In scenarios where NPUs have limited HBM (High Bandwidth Memory) capacity, "
|
||||||
|
"dynamic memory allocation/deallocation during inference can exacerbate "
|
||||||
|
"memory fragmentation, leading to OOM. To address this:"
|
||||||
|
msgstr ""
|
||||||
|
"在 NPU 的 HBM(高带宽内存)容量有限的场景下,推理过程中动态内存分配和释放会加剧内存碎片,从而导致 OOM(内存溢出)。为了解决这个问题:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:113
|
||||||
|
msgid ""
|
||||||
|
"**Adjust `--gpu-memory-utilization`**: If unspecified, will use the default "
|
||||||
|
"value of `0.9`. You can decrease this param to reserve more memory to reduce"
|
||||||
|
" fragmentation risks. See more note in: [vLLM - Inference and Serving - "
|
||||||
|
"Engine "
|
||||||
|
"Arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-"
|
||||||
|
"_engine_args_parser-cacheconfig)."
|
||||||
|
msgstr ""
|
||||||
|
"**调整 `--gpu-memory-utilization`**:如果未指定,将使用默认值 "
|
||||||
|
"`0.9`。你可以降低此参数来预留更多内存,从而降低内存碎片风险。参见更多说明:[vLLM - 推理与服务 - "
|
||||||
|
"引擎参数](https://docs.vllm.ai/en/latest/serving/engine_args.html#vllm.engine.arg_utils-"
|
||||||
|
"_engine_args_parser-cacheconfig)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:115
|
||||||
|
msgid ""
|
||||||
|
"**Configure `PYTORCH_NPU_ALLOC_CONF`**: Set this environment variable to "
|
||||||
|
"optimize NPU memory management. For example, you can `export "
|
||||||
|
"PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` to enable virtual memory "
|
||||||
|
"feature to mitigate memory fragmentation caused by frequent dynamic memory "
|
||||||
|
"size adjustments during runtime, see more note in: "
|
||||||
|
"[PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)."
|
||||||
|
msgstr ""
|
||||||
|
"**配置 `PYTORCH_NPU_ALLOC_CONF`**:设置此环境变量以优化NPU内存管理。例如,你可以通过 `export "
|
||||||
|
"PYTORCH_NPU_ALLOC_CONF=expandable_segments:True` "
|
||||||
|
"来启用虚拟内存功能,以缓解运行时频繁动态调整内存大小导致的内存碎片问题,更多说明参见:[PYTORCH_NPU_ALLOC_CONF](https://www.hiascend.com/document/detail/zh/Pytorch/700/comref/Envvariables/Envir_012.html)。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:117
|
||||||
|
msgid "16. Failed to enable NPU graph mode when running DeepSeek?"
|
||||||
|
msgstr "16. 运行 DeepSeek 时无法启用 NPU 图模式?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:118
|
||||||
|
#, python-brace-format
|
||||||
|
msgid ""
|
||||||
|
"You may encounter the following error if running DeepSeek with NPU graph "
|
||||||
|
"mode enabled. The allowed number of queries per kv when enabling both MLA "
|
||||||
|
"and Graph mode only support {32, 64, 128}, **Thus this is not supported for "
|
||||||
|
"DeepSeek-V2-Lite**, as it only has 16 attention heads. The NPU graph mode "
|
||||||
|
"support on DeepSeek-V2-Lite will be done in the future."
|
||||||
|
msgstr ""
|
||||||
|
"如果在启用NPU图模式(Graph "
|
||||||
|
"mode)运行DeepSeek时,您可能会遇到以下错误。当同时启用MLA和图模式时,每个kv允许的查询数只支持{32, 64, "
|
||||||
|
"128},**因此这不支持DeepSeek-V2-Lite**,因为它只有16个注意力头。未来会增加对DeepSeek-V2-Lite在NPU图模式下的支持。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:120
|
||||||
|
#, python-brace-format
|
||||||
|
msgid ""
|
||||||
|
"And if you're using DeepSeek-V3 or DeepSeek-R1, please make sure after the "
|
||||||
|
"tensor parallel split, num_heads / num_kv_heads in {32, 64, 128}."
|
||||||
|
msgstr ""
|
||||||
|
"如果你正在使用 DeepSeek-V3 或 DeepSeek-R1,请确保在张量并行切分后,num_heads / num_kv_heads 的值为 "
|
||||||
|
"{32, 64, 128} 中的一个。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:127
|
||||||
|
msgid ""
|
||||||
|
"17. Failed to reinstall vllm-ascend from source after uninstalling vllm-"
|
||||||
|
"ascend?"
|
||||||
|
msgstr "17. 卸载 vllm-ascend 后无法从源码重新安装 vllm-ascend?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:128
|
||||||
|
msgid ""
|
||||||
|
"You may encounter the problem of C compilation failure when reinstalling "
|
||||||
|
"vllm-ascend from source using pip. If the installation fails, it is "
|
||||||
|
"recommended to use `python setup.py install` to install, or use `python "
|
||||||
|
"setup.py clean` to clear the cache."
|
||||||
|
msgstr ""
|
||||||
|
"当你使用 pip 从源码重新安装 vllm-ascend 时,可能会遇到 C 编译失败的问题。如果安装失败,建议使用 `python setup.py "
|
||||||
|
"install` 进行安装,或者使用 `python setup.py clean` 清除缓存。"
|
||||||
|
|
||||||
|
#: ../../faqs.md:130
|
||||||
|
msgid "18. How to generate deterministic results when using vllm-ascend?"
|
||||||
|
msgstr "18. 使用 vllm-ascend 时如何生成确定性结果?"
|
||||||
|
|
||||||
|
#: ../../faqs.md:131
|
||||||
|
msgid "There are several factors that affect output certainty:"
|
||||||
|
msgstr "有几个因素会影响输出的确定性:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:133
|
||||||
|
msgid ""
|
||||||
|
"Sampler Method: using **Greedy sample** by setting `temperature=0` in "
|
||||||
|
"`SamplingParams`, e.g.:"
|
||||||
|
msgstr ""
|
||||||
|
"采样方法:通过在 `SamplingParams` 中设置 `temperature=0` 来使用 **贪婪采样(Greedy "
|
||||||
|
"sample)**,例如:"
|
||||||
|
|
||||||
|
#: ../../faqs.md:158
|
||||||
|
msgid "Set the following environment parameters:"
|
||||||
|
msgstr "设置以下环境参数:"
|
||||||
79
docs/source/locale/zh_CN/LC_MESSAGES/index.po
Normal file
79
docs/source/locale/zh_CN/LC_MESSAGES/index.po
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: 2025-07-18 10:05+0800\n"
|
||||||
|
"Last-Translator: \n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
"X-Generator: Poedit 3.5\n"
|
||||||
|
|
||||||
|
#: ../../index.md:33
|
||||||
|
msgid "Getting Started"
|
||||||
|
msgstr "快速开始"
|
||||||
|
|
||||||
|
#: ../../index.md:43
|
||||||
|
msgid "User Guide"
|
||||||
|
msgstr "用户指南"
|
||||||
|
|
||||||
|
#: ../../index.md:53
|
||||||
|
msgid "Developer Guide"
|
||||||
|
msgstr "开发者指南"
|
||||||
|
|
||||||
|
#: ../../index.md:64
|
||||||
|
msgid "Community"
|
||||||
|
msgstr "社区"
|
||||||
|
|
||||||
|
#: ../../index.md:1
|
||||||
|
msgid "Welcome to vLLM Ascend Plugin"
|
||||||
|
msgstr "欢迎使用 vLLM Ascend 插件"
|
||||||
|
|
||||||
|
#: ../../index.md:3
|
||||||
|
msgid "vLLM"
|
||||||
|
msgstr "vLLM"
|
||||||
|
|
||||||
|
#: ../../index.md:24
|
||||||
|
msgid ""
|
||||||
|
"vLLM Ascend plugin (vllm-ascend) is a community maintained hardware plugin "
|
||||||
|
"for running vLLM on the Ascend NPU."
|
||||||
|
msgstr ""
|
||||||
|
"vLLM Ascend 插件(vllm-ascend)是一个由社区维护的硬件插件,用于在 Ascend "
|
||||||
|
"NPU 上运行 vLLM。"
|
||||||
|
|
||||||
|
#: ../../index.md:26
|
||||||
|
msgid ""
|
||||||
|
"This plugin is the recommended approach for supporting the Ascend backend "
|
||||||
|
"within the vLLM community. It adheres to the principles outlined in the "
|
||||||
|
"[[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/"
|
||||||
|
"issues/11162), providing a hardware-pluggable interface that decouples the "
|
||||||
|
"integration of the Ascend NPU with vLLM."
|
||||||
|
msgstr ""
|
||||||
|
"该插件是 vLLM 社区推荐用于支持 Ascend 后端的方法。它遵循 [[RFC]: Hardware "
|
||||||
|
"pluggable](https://github.com/vllm-project/vllm/issues/11162) 中提出的原"
|
||||||
|
"则,提供了一个硬件可插拔接口,实现了 Ascend NPU 与 vLLM 集成的解耦。"
|
||||||
|
|
||||||
|
#: ../../index.md:28
|
||||||
|
msgid ""
|
||||||
|
"By using vLLM Ascend plugin, popular open-source models, including "
|
||||||
|
"Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run "
|
||||||
|
"seamlessly on the Ascend NPU."
|
||||||
|
msgstr ""
|
||||||
|
"通过使用 vLLM Ascend 插件,流行的开源模型,包括 Transformer 类、混合专家、"
|
||||||
|
"嵌入式、多模态大模型等,都可以在 Ascend NPU 上无缝运行。"
|
||||||
|
|
||||||
|
#: ../../index.md:30
|
||||||
|
msgid "Documentation"
|
||||||
|
msgstr "文档"
|
||||||
293
docs/source/locale/zh_CN/LC_MESSAGES/installation.po
Normal file
293
docs/source/locale/zh_CN/LC_MESSAGES/installation.po
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: 2025-07-18 10:09+0800\n"
|
||||||
|
"Last-Translator: \n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
"X-Generator: Poedit 3.5\n"
|
||||||
|
|
||||||
|
#: ../../installation.md:1
|
||||||
|
msgid "Installation"
|
||||||
|
msgstr "安装"
|
||||||
|
|
||||||
|
#: ../../installation.md:3
|
||||||
|
msgid "This document describes how to install vllm-ascend manually."
|
||||||
|
msgstr "本文档介绍如何手动安装 vllm-ascend。"
|
||||||
|
|
||||||
|
#: ../../installation.md:5
|
||||||
|
msgid "Requirements"
|
||||||
|
msgstr "要求"
|
||||||
|
|
||||||
|
#: ../../installation.md:7
|
||||||
|
msgid "OS: Linux"
|
||||||
|
msgstr "操作系统:Linux"
|
||||||
|
|
||||||
|
#: ../../installation.md:8
|
||||||
|
msgid "Python: >= 3.9, < 3.12"
|
||||||
|
msgstr "Python:>= 3.9,< 3.12"
|
||||||
|
|
||||||
|
#: ../../installation.md:9
|
||||||
|
msgid "Hardware with Ascend NPU. It's usually the Atlas 800 A2 series."
|
||||||
|
msgstr "配备有昇腾NPU的硬件,通常是Atlas 800 A2系列。"
|
||||||
|
|
||||||
|
#: ../../installation.md:10
|
||||||
|
msgid "Software:"
|
||||||
|
msgstr "软件:"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Software"
|
||||||
|
msgstr "软件"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Supported version"
|
||||||
|
msgstr "支持的版本"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Note"
|
||||||
|
msgstr "注释"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "CANN"
|
||||||
|
msgstr "CANN"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid ">= 8.1.RC1"
|
||||||
|
msgstr ">= 8.1.RC1"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Required for vllm-ascend and torch-npu"
|
||||||
|
msgstr "vllm-ascend 和 torch-npu 必需"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "torch-npu"
|
||||||
|
msgstr "torch-npu"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid ">= 2.5.1.post1.dev20250619"
|
||||||
|
msgstr ">= 2.5.1.post1.dev20250619"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid ""
|
||||||
|
"Required for vllm-ascend, No need to install manually, it will be auto "
|
||||||
|
"installed in below steps"
|
||||||
|
msgstr "vllm-ascend 必需,无需手动安装,后续步骤会自动安装。"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "torch"
|
||||||
|
msgstr "torch"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid ">= 2.5.1"
|
||||||
|
msgstr ">= 2.5.1"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Required for torch-npu and vllm"
|
||||||
|
msgstr "torch-npu 和 vllm 所需"
|
||||||
|
|
||||||
|
#: ../../installation.md:18
|
||||||
|
msgid "You have 2 ways to install:"
|
||||||
|
msgstr "你有两种安装方式:"
|
||||||
|
|
||||||
|
#: ../../installation.md:19
|
||||||
|
msgid ""
|
||||||
|
"**Using pip**: first prepare env manually or via CANN image, then install "
|
||||||
|
"`vllm-ascend` using pip."
|
||||||
|
msgstr ""
|
||||||
|
"**使用 pip**:首先手动准备环境或通过 CANN 镜像准备环境,然后使用 pip 安装 "
|
||||||
|
"`vllm-ascend`。"
|
||||||
|
|
||||||
|
#: ../../installation.md:20
|
||||||
|
msgid ""
|
||||||
|
"**Using docker**: use the `vllm-ascend` pre-built docker image directly."
|
||||||
|
msgstr "**使用 docker**:直接使用 `vllm-ascend` 预构建的 docker 镜像。"
|
||||||
|
|
||||||
|
#: ../../installation.md:22
|
||||||
|
msgid "Configure a new environment"
|
||||||
|
msgstr "配置一个新环境"
|
||||||
|
|
||||||
|
#: ../../installation.md:24
|
||||||
|
msgid ""
|
||||||
|
"Before installing, you need to make sure firmware/driver and CANN are "
|
||||||
|
"installed correctly, refer to [link](https://ascend.github.io/docs/sources/"
|
||||||
|
"ascend/quick_install.html) for more details."
|
||||||
|
msgstr ""
|
||||||
|
"在安装之前,您需要确保固件/驱动和 CANN 已正确安装,更多详情请参考 [链接]"
|
||||||
|
"(https://ascend.github.io/docs/sources/ascend/quick_install.html)。"
|
||||||
|
|
||||||
|
#: ../../installation.md:26
|
||||||
|
msgid "Configure hardware environment"
|
||||||
|
msgstr "配置硬件环境"
|
||||||
|
|
||||||
|
#: ../../installation.md:28
|
||||||
|
msgid ""
|
||||||
|
"To verify that the Ascend NPU firmware and driver were correctly installed, "
|
||||||
|
"run:"
|
||||||
|
msgstr "要验证 Ascend NPU 固件和驱动程序是否正确安装,请运行:"
|
||||||
|
|
||||||
|
#: ../../installation.md:34
|
||||||
|
msgid ""
|
||||||
|
"Refer to [Ascend Environment Setup Guide](https://ascend.github.io/docs/"
|
||||||
|
"sources/ascend/quick_install.html) for more details."
|
||||||
|
msgstr ""
|
||||||
|
"更多详情请参考[Ascend环境搭建指南](https://ascend.github.io/docs/sources/"
|
||||||
|
"ascend/quick_install.html)。"
|
||||||
|
|
||||||
|
#: ../../installation.md:36
|
||||||
|
msgid "Configure software environment"
|
||||||
|
msgstr "配置软件环境"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Before using pip"
|
||||||
|
msgstr "在使用 pip 之前"
|
||||||
|
|
||||||
|
#: ../../installation.md:46
|
||||||
|
msgid ""
|
||||||
|
"The easiest way to prepare your software environment is using CANN image "
|
||||||
|
"directly:"
|
||||||
|
msgstr "最简单的方式是直接使用 CANN 镜像来准备您的软件环境:"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Click here to see \"Install CANN manually\""
|
||||||
|
msgstr "点击此处查看“手动安装 CANN”"
|
||||||
|
|
||||||
|
#: ../../installation.md:72
|
||||||
|
msgid "You can also install CANN manually:"
|
||||||
|
msgstr "你也可以手动安装 CANN:"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Before using docker"
|
||||||
|
msgstr "在使用 docker 之前"
|
||||||
|
|
||||||
|
#: ../../installation.md:104
|
||||||
|
msgid ""
|
||||||
|
"No more extra step if you are using `vllm-ascend` prebuilt docker image."
|
||||||
|
msgstr "如果你使用 `vllm-ascend` 预构建的 docker 镜像,就无需额外的步骤。"
|
||||||
|
|
||||||
|
#: ../../installation.md:108
|
||||||
|
msgid "Once it's done, you can start to set up `vllm` and `vllm-ascend`."
|
||||||
|
msgstr "完成后,你可以开始配置 `vllm` 和 `vllm-ascend`。"
|
||||||
|
|
||||||
|
#: ../../installation.md:110
|
||||||
|
msgid "Setup vllm and vllm-ascend"
|
||||||
|
msgstr "安装 vllm 和 vllm-ascend"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Using pip"
|
||||||
|
msgstr "使用 pip"
|
||||||
|
|
||||||
|
#: ../../installation.md:121
|
||||||
|
msgid "First install system dependencies and config pip mirror:"
|
||||||
|
msgstr "首先安装系统依赖并配置 pip 镜像:"
|
||||||
|
|
||||||
|
#: ../../installation.md:133
|
||||||
|
msgid ""
|
||||||
|
"**[Optional]** Then config the extra-index of `pip` if you are working on a "
|
||||||
|
"x86 machine or using torch-npu dev version:"
|
||||||
|
msgstr ""
|
||||||
|
"**[可选]** 如果你在 x86 机器上工作或使用 torch-npu 开发版,请配置 `pip` 的额"
|
||||||
|
"外索引:"
|
||||||
|
|
||||||
|
#: ../../installation.md:140
|
||||||
|
msgid ""
|
||||||
|
"Then you can install `vllm` and `vllm-ascend` from **pre-built wheel**:"
|
||||||
|
msgstr "然后你可以从**预编译的 wheel 包**安装 `vllm` 和 `vllm-ascend`:"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Click here to see \"Build from source code\""
|
||||||
|
msgstr "点击此处查看“从源代码构建”"
|
||||||
|
|
||||||
|
#: ../../installation.md:153
|
||||||
|
msgid "or build from **source code**:"
|
||||||
|
msgstr "或者从**源代码**构建:"
|
||||||
|
|
||||||
|
#: ../../installation.md:171
|
||||||
|
msgid ""
|
||||||
|
"vllm-ascend will build custom ops by default. If you don't want to build "
|
||||||
|
"it, set `COMPILE_CUSTOM_KERNELS=0` environment to disable it."
|
||||||
|
msgstr ""
|
||||||
|
"vllm-ascend 默认会编译自定义算子。如果你不想编译它,可以设置环境变量 "
|
||||||
|
"`COMPILE_CUSTOM_KERNELS=0` 来禁用。"
|
||||||
|
|
||||||
|
#: ../../installation.md:175
|
||||||
|
msgid ""
|
||||||
|
"If you are building from v0.7.3-dev and intend to use sleep mode feature, "
|
||||||
|
"you should set `COMPILE_CUSTOM_KERNELS=1` manually. To build custom ops, "
|
||||||
|
"gcc/g++ higher than 8 and c++ 17 or higher is required. If you're using "
|
||||||
|
"`pip install -e .` and encounter a torch-npu version conflict, please "
|
||||||
|
"install with `pip install --no-build-isolation -e .` to build on system "
|
||||||
|
"env. If you encounter other problems during compiling, it is probably "
|
||||||
|
"because unexpected compiler is being used, you may export `CXX_COMPILER` "
|
||||||
|
"and `C_COMPILER` in env to specify your g++ and gcc locations before "
|
||||||
|
"compiling."
|
||||||
|
msgstr ""
|
||||||
|
"如果你是从 v0.7.3-dev 版本开始构建,并且打算使用休眠模式功能,你需要手动设"
|
||||||
|
"置 `COMPILE_CUSTOM_KERNELS=1`。构建自定义算子时,要求 gcc/g++ 版本高于 8 且"
|
||||||
|
"支持 c++ 17 或更高标准。如果你正在使用 `pip install -e .` 并且出现了 torch-"
|
||||||
|
"npu 版本冲突,请使用 `pip install --no-build-isolation -e .` 在系统环境下进"
|
||||||
|
"行安装。如果在编译过程中遇到其它问题,可能是因为使用了非预期的编译器,你可以"
|
||||||
|
"在编译前通过环境变量导出 `CXX_COMPILER` 和 `C_COMPILER`,以指定你的 g++ 和 "
|
||||||
|
"gcc 路径。"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Using docker"
|
||||||
|
msgstr "使用 docker"
|
||||||
|
|
||||||
|
#: ../../installation.md:184
|
||||||
|
msgid "You can just pull the **prebuilt image** and run it with bash."
|
||||||
|
msgstr "你可以直接拉取**预构建镜像**并用 bash 运行它。"
|
||||||
|
|
||||||
|
#: ../../installation.md
|
||||||
|
msgid "Click here to see \"Build from Dockerfile\""
|
||||||
|
msgstr "点击这里查看“从 Dockerfile 构建”"
|
||||||
|
|
||||||
|
#: ../../installation.md:187
|
||||||
|
msgid "or build IMAGE from **source code**:"
|
||||||
|
msgstr "或从**源代码**构建 IMAGE:"
|
||||||
|
|
||||||
|
#: ../../installation.md:218
|
||||||
|
msgid ""
|
||||||
|
"The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed "
|
||||||
|
"in `/vllm-workspace` and installed in [development mode](https://setuptools."
|
||||||
|
"pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to "
|
||||||
|
"help developer immediately take place changes without requiring a new "
|
||||||
|
"installation."
|
||||||
|
msgstr ""
|
||||||
|
"默认的工作目录是 `/workspace`,vLLM 和 vLLM Ascend 代码被放置在 `/vllm-"
|
||||||
|
"workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/"
|
||||||
|
"development_mode.html)(`pip install -e`)安装,以便开发者能够即时生效更改,"
|
||||||
|
"而无需重新安装。"
|
||||||
|
|
||||||
|
#: ../../installation.md:222
|
||||||
|
msgid "Extra information"
|
||||||
|
msgstr "额外信息"
|
||||||
|
|
||||||
|
#: ../../installation.md:224
|
||||||
|
msgid "Verify installation"
|
||||||
|
msgstr "验证安装"
|
||||||
|
|
||||||
|
#: ../../installation.md:226
|
||||||
|
msgid "Create and run a simple inference test. The `example.py` can be like:"
|
||||||
|
msgstr "创建并运行一个简单的推理测试。`example.py` 可以如下:"
|
||||||
|
|
||||||
|
#: ../../installation.md:251
|
||||||
|
msgid "Then run:"
|
||||||
|
msgstr "然后运行:"
|
||||||
|
|
||||||
|
#: ../../installation.md:259
|
||||||
|
msgid "The output will be like:"
|
||||||
|
msgstr "输出将会像这样:"
|
||||||
149
docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po
Normal file
149
docs/source/locale/zh_CN/LC_MESSAGES/quick_start.po
Normal file
@@ -0,0 +1,149 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: 2025-07-18 10:09+0800\n"
|
||||||
|
"Last-Translator: \n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
"X-Generator: Poedit 3.5\n"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:1
|
||||||
|
msgid "Quickstart"
|
||||||
|
msgstr "快速入门"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:3
|
||||||
|
msgid "Prerequisites"
|
||||||
|
msgstr "先决条件"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:5
|
||||||
|
msgid "Supported Devices"
|
||||||
|
msgstr "支持的设备"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:6
|
||||||
|
msgid ""
|
||||||
|
"Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 "
|
||||||
|
"Box16, Atlas 300T A2)"
|
||||||
|
msgstr ""
|
||||||
|
"Atlas A2 训练系列(Atlas 800T A2,Atlas 900 A2 PoD,Atlas 200T A2 Box16,"
|
||||||
|
"Atlas 300T A2)"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:7
|
||||||
|
msgid "Atlas 800I A2 Inference series (Atlas 800I A2)"
|
||||||
|
msgstr "Atlas 800I A2 推理系列(Atlas 800I A2)"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:9
|
||||||
|
msgid "Setup environment using container"
|
||||||
|
msgstr "使用容器设置环境"
|
||||||
|
|
||||||
|
#: ../../quick_start.md
|
||||||
|
msgid "Ubuntu"
|
||||||
|
msgstr "Ubuntu"
|
||||||
|
|
||||||
|
#: ../../quick_start.md
|
||||||
|
msgid "openEuler"
|
||||||
|
msgstr "openEuler"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:69
|
||||||
|
msgid ""
|
||||||
|
"The default workdir is `/workspace`, vLLM and vLLM Ascend code are placed "
|
||||||
|
"in `/vllm-workspace` and installed in [development mode](https://setuptools."
|
||||||
|
"pypa.io/en/latest/userguide/development_mode.html)(`pip install -e`) to "
|
||||||
|
"help developer immediately take place changes without requiring a new "
|
||||||
|
"installation."
|
||||||
|
msgstr ""
|
||||||
|
"默认的工作目录是 `/workspace`,vLLM 和 vLLM Ascend 代码被放置在 `/vllm-"
|
||||||
|
"workspace`,并以[开发模式](https://setuptools.pypa.io/en/latest/userguide/"
|
||||||
|
"development_mode.html)(`pip install -e`)安装,以便开发者能够即时生效更改,"
|
||||||
|
"而无需重新安装。"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:71
|
||||||
|
msgid "Usage"
|
||||||
|
msgstr "用法"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:73
|
||||||
|
msgid "You can use Modelscope mirror to speed up download:"
|
||||||
|
msgstr "你可以使用 Modelscope 镜像来加速下载:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:80
|
||||||
|
msgid "There are two ways to start vLLM on Ascend NPU:"
|
||||||
|
msgstr "在昇腾 NPU 上启动 vLLM 有两种方式:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md
|
||||||
|
msgid "Offline Batched Inference"
|
||||||
|
msgstr "离线批量推理"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:86
|
||||||
|
msgid ""
|
||||||
|
"With vLLM installed, you can start generating texts for list of input "
|
||||||
|
"prompts (i.e. offline batch inferencing)."
|
||||||
|
msgstr ""
|
||||||
|
"安装了 vLLM 后,您可以开始为一系列输入提示生成文本(即离线批量推理)。"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:88
|
||||||
|
msgid ""
|
||||||
|
"Try to run below Python script directly or use `python3` shell to generate "
|
||||||
|
"texts:"
|
||||||
|
msgstr ""
|
||||||
|
"尝试直接运行下面的 Python 脚本,或者使用 `python3` 交互式命令行来生成文本:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md
|
||||||
|
msgid "OpenAI Completions API"
|
||||||
|
msgstr "OpenAI Completions API"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:114
|
||||||
|
msgid ""
|
||||||
|
"vLLM can also be deployed as a server that implements the OpenAI API "
|
||||||
|
"protocol. Run the following command to start the vLLM server with the [Qwen/"
|
||||||
|
"Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) "
|
||||||
|
"model:"
|
||||||
|
msgstr ""
|
||||||
|
"vLLM 也可以作为实现 OpenAI API 协议的服务器进行部署。运行以下命令,使用 "
|
||||||
|
"[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-"
|
||||||
|
"Instruct) 模型启动 vLLM 服务器:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:124
|
||||||
|
msgid "If you see log as below:"
|
||||||
|
msgstr "如果你看到如下日志:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:132
|
||||||
|
msgid "Congratulations, you have successfully started the vLLM server!"
|
||||||
|
msgstr "恭喜,你已经成功启动了 vLLM 服务器!"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:134
|
||||||
|
msgid "You can query the list of models:"
|
||||||
|
msgstr "你可以查询模型列表:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:141
|
||||||
|
msgid "You can also query the model with input prompts:"
|
||||||
|
msgstr "你也可以通过输入提示来查询模型:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:155
|
||||||
|
msgid ""
|
||||||
|
"vLLM is serving as a background process, you can use `kill -2 $VLLM_PID` to "
|
||||||
|
"stop the background process gracefully, it's equal to `Ctrl-C` to stop "
|
||||||
|
"foreground vLLM process:"
|
||||||
|
msgstr ""
|
||||||
|
"vLLM 正作为后台进程运行,你可以使用 `kill -2 $VLLM_PID` 来优雅地停止后台进"
|
||||||
|
"程,这等同于使用 `Ctrl-C` 停止前台 vLLM 进程:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:164
|
||||||
|
msgid "You will see output as below:"
|
||||||
|
msgstr "你将会看到如下输出:"
|
||||||
|
|
||||||
|
#: ../../quick_start.md:172
|
||||||
|
msgid "Finally, you can exit container by using `ctrl-D`."
|
||||||
|
msgstr "最后,你可以通过按 `ctrl-D` 退出容器。"
|
||||||
29
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po
Normal file
29
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/index.po
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../tutorials/index.md:3
|
||||||
|
msgid "Deployment"
|
||||||
|
msgstr "部署"
|
||||||
|
|
||||||
|
#: ../../tutorials/index.md:1
|
||||||
|
msgid "Tutorials"
|
||||||
|
msgstr "教程"
|
||||||
192
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po
Normal file
192
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_node.po
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:1
|
||||||
|
msgid "Multi-Node-DP (DeepSeek)"
|
||||||
|
msgstr "多节点分布式处理(DeepSeek)"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:3
|
||||||
|
msgid "Getting Start"
|
||||||
|
msgstr "快速开始"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:4
|
||||||
|
msgid ""
|
||||||
|
"vLLM-Ascend now supports Data Parallel (DP) deployment, enabling model "
|
||||||
|
"weights to be replicated across multiple NPUs or instances, each processing "
|
||||||
|
"independent batches of requests. This is particularly useful for scaling "
|
||||||
|
"throughput across devices while maintaining high resource utilization."
|
||||||
|
msgstr ""
|
||||||
|
"vLLM-Ascend 现在支持数据并行(DP)部署,可以在多个 NPU "
|
||||||
|
"或实例之间复制模型权重,每个实例处理独立的请求批次。这对于在保证高资源利用率的同时,实现跨设备的吞吐量扩展特别有用。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:6
|
||||||
|
msgid ""
|
||||||
|
"Each DP rank is deployed as a separate “core engine” process which "
|
||||||
|
"communicates with front-end process(es) via ZMQ sockets. Data Parallel can "
|
||||||
|
"be combined with Tensor Parallel, in which case each DP engine owns a number"
|
||||||
|
" of per-NPU worker processes equal to the TP size."
|
||||||
|
msgstr ""
|
||||||
|
"每个 DP 进程作为一个单独的“核心引擎”进程部署,并通过 ZMQ 套接字与前端进程通信。数据并行可以与张量并行结合使用,此时每个 DP "
|
||||||
|
"引擎拥有数量等于 TP 大小的每 NPU 工作进程。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:8
|
||||||
|
msgid ""
|
||||||
|
"For Mixture-of-Experts (MoE) models — especially advanced architectures like"
|
||||||
|
" DeepSeek that utilize Multi-head Latent Attention (MLA) — a hybrid "
|
||||||
|
"parallelism approach is recommended: - Use **Data Parallelism (DP)** for"
|
||||||
|
" attention layers, which are replicated across devices and handle separate "
|
||||||
|
"batches. - Use **Expert or Tensor Parallelism (EP/TP)** for expert "
|
||||||
|
"layers, which are sharded across devices to distribute the computation."
|
||||||
|
msgstr ""
|
||||||
|
"对于混合专家(Mixture-of-Experts, MoE)模型——尤其是像 DeepSeek 这样采用多头潜在注意力(Multi-head Latent Attention, MLA)的高级架构——推荐使用混合并行策略:\n"
|
||||||
|
" - 对于注意力层,使用 **数据并行(Data Parallelism, DP)**,这些层会在各设备间复刻,并处理不同的批次。\n"
|
||||||
|
" - 对于专家层,使用 **专家并行或张量并行(Expert or Tensor Parallelism, EP/TP)**,这些层会在设备间分片,从而分担计算。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:12
|
||||||
|
msgid ""
|
||||||
|
"This division enables attention layers to be replicated across Data Parallel"
|
||||||
|
" (DP) ranks, enabling them to process different batches independently. "
|
||||||
|
"Meanwhile, expert layers are partitioned (sharded) across devices using "
|
||||||
|
"Expert or Tensor Parallelism(DP*TP), maximizing hardware utilization and "
|
||||||
|
"efficiency."
|
||||||
|
msgstr ""
|
||||||
|
"这种划分使得注意力层能够在数据并行(DP)组内复制,从而能够独立处理不同的批次。同时,专家层通过专家或张量并行(DP*TP)在设备间进行分区(切片),最大化硬件利用率和效率。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:14
|
||||||
|
msgid ""
|
||||||
|
"In these cases the data parallel ranks are not completely independent, "
|
||||||
|
"forward passes must be aligned and expert layers across all ranks are "
|
||||||
|
"required to synchronize during every forward pass, even if there are fewer "
|
||||||
|
"requests to be processed than DP ranks."
|
||||||
|
msgstr ""
|
||||||
|
"在这些情况下,数据并行的各个 rank 不是完全独立的,前向传播必须对齐,并且所有 rank "
|
||||||
|
"上的专家层在每次前向传播时都需要同步,即使待处理的请求数量少于 DP rank 的数量。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:16
|
||||||
|
msgid ""
|
||||||
|
"For MoE models, when any requests are in progress in any rank, we must "
|
||||||
|
"ensure that empty “dummy” forward passes are performed in all ranks which "
|
||||||
|
"don’t currently have any requests scheduled. This is handled via a separate "
|
||||||
|
"DP `Coordinator` process which communicates with all of the ranks, and a "
|
||||||
|
"collective operation performed every N steps to determine when all ranks "
|
||||||
|
"become idle and can be paused. When TP is used in conjunction with DP, "
|
||||||
|
"expert layers form an EP or TP group of size (DP x TP)."
|
||||||
|
msgstr ""
|
||||||
|
"对于 MoE 模型,当任何一个 rank 有请求正在进行时,必须确保所有当前没有请求的 rank 都执行空的“虚拟”前向传播。这是通过一个单独的 DP "
|
||||||
|
"`Coordinator` 协调器进程来实现的,该进程与所有 rank 通信,并且每隔 N 步执行一次集体操作,以判断所有 rank "
|
||||||
|
"是否都处于空闲状态并可以暂停。当 TP 与 DP 结合使用时,专家层会组成一个规模为(DP x TP)的 EP 或 TP 组。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:18
|
||||||
|
msgid "Verify Multi-Node Communication Environment"
|
||||||
|
msgstr "验证多节点通信环境"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:20
|
||||||
|
msgid "Physical Layer Requirements:"
|
||||||
|
msgstr "物理层要求:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:22
|
||||||
|
msgid ""
|
||||||
|
"The physical machines must be located on the same WLAN, with network "
|
||||||
|
"connectivity."
|
||||||
|
msgstr "物理机器必须位于同一个 WLAN 中,并且具有网络连接。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:23
|
||||||
|
msgid ""
|
||||||
|
"All NPUs are connected with optical modules, and the connection status must "
|
||||||
|
"be normal."
|
||||||
|
msgstr "所有 NPU 都通过光模块连接,且连接状态必须正常。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:25
|
||||||
|
msgid "Verification Process:"
|
||||||
|
msgstr "验证流程:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:27
|
||||||
|
msgid ""
|
||||||
|
"Execute the following commands on each node in sequence. The results must "
|
||||||
|
"all be `success` and the status must be `UP`:"
|
||||||
|
msgstr "在每个节点上依次执行以下命令。所有结果必须为 `success` 且状态必须为 `UP`:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:44
|
||||||
|
msgid "NPU Interconnect Verification:"
|
||||||
|
msgstr "NPU 互连验证:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:45
|
||||||
|
msgid "1. Get NPU IP Addresses"
|
||||||
|
msgstr "1. 获取 NPU IP 地址"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:50
|
||||||
|
msgid "2. Cross-Node PING Test"
|
||||||
|
msgstr "2. 跨节点 PING 测试"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:56
|
||||||
|
msgid "Run with docker"
|
||||||
|
msgstr "用 docker 运行"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:57
|
||||||
|
msgid ""
|
||||||
|
"Assume you have two Atlas 800 A2(64G*8) nodes, and want to deploy the "
|
||||||
|
"`deepseek-v3-w8a8` quantitative model across multi-node."
|
||||||
|
msgstr "假设你有两台 Atlas 800 A2(64G*8)节点,并且想要在多节点上部署 `deepseek-v3-w8a8` 量化模型。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:92
|
||||||
|
msgid ""
|
||||||
|
"Before launch the inference server, ensure some environment variables are "
|
||||||
|
"set for multi node communication"
|
||||||
|
msgstr "在启动推理服务器之前,确保已经为多节点通信设置了一些环境变量。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:95
|
||||||
|
msgid "Run the following scripts on two nodes respectively"
|
||||||
|
msgstr "分别在两台节点上运行以下脚本"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:97
|
||||||
|
msgid "**node0**"
|
||||||
|
msgstr "**节点0**"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:137
|
||||||
|
msgid "**node1**"
|
||||||
|
msgstr "**节点1**"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:176
|
||||||
|
msgid ""
|
||||||
|
"The Deployment view looks like: "
|
||||||
|
msgstr "部署视图如下所示:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:176
|
||||||
|
msgid "alt text"
|
||||||
|
msgstr "替代文本"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:179
|
||||||
|
msgid ""
|
||||||
|
"Once your server is started, you can query the model with input prompts:"
|
||||||
|
msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:192
|
||||||
|
msgid "Run benchmarks"
|
||||||
|
msgstr "运行基准测试"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_node.md:193
|
||||||
|
msgid ""
|
||||||
|
"For details please refer to [benchmark](https://github.com/vllm-"
|
||||||
|
"project/vllm-ascend/tree/main/benchmarks)"
|
||||||
|
msgstr ""
|
||||||
|
"详细信息请参阅 [benchmark](https://github.com/vllm-project/vllm-"
|
||||||
|
"ascend/tree/main/benchmarks)"
|
||||||
62
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po
Normal file
62
docs/source/locale/zh_CN/LC_MESSAGES/tutorials/multi_npu.po
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:1
|
||||||
|
msgid "Multi-NPU (QwQ 32B)"
|
||||||
|
msgstr "多NPU(QwQ 32B)"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:3
|
||||||
|
msgid "Run vllm-ascend on Multi-NPU"
|
||||||
|
msgstr "在多NPU上运行 vllm-ascend"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:5
|
||||||
|
msgid "Run docker container:"
|
||||||
|
msgstr "运行 docker 容器:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:30
|
||||||
|
msgid "Setup environment variables:"
|
||||||
|
msgstr "设置环境变量:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:40
|
||||||
|
msgid "Online Inference on Multi-NPU"
|
||||||
|
msgstr "多NPU的在线推理"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:42
|
||||||
|
msgid "Run the following script to start the vLLM server on Multi-NPU:"
|
||||||
|
msgstr "运行以下脚本,在多NPU上启动 vLLM 服务器:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:48
|
||||||
|
msgid ""
|
||||||
|
"Once your server is started, you can query the model with input prompts"
|
||||||
|
msgstr "一旦服务器启动,就可以通过输入提示词来查询模型。"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:63
|
||||||
|
msgid "Offline Inference on Multi-NPU"
|
||||||
|
msgstr "多NPU离线推理"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:65
|
||||||
|
msgid "Run the following script to execute offline inference on multi-NPU:"
|
||||||
|
msgstr "运行以下脚本以在多NPU上执行离线推理:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu.md:102
|
||||||
|
msgid "If you run this script successfully, you can see the info shown below:"
|
||||||
|
msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:"
|
||||||
@@ -0,0 +1,86 @@
|
|||||||
|
# SOME DESCRIPTIVE TITLE.
|
||||||
|
# Copyright (C) 2025, vllm-ascend team
|
||||||
|
# This file is distributed under the same license as the vllm-ascend
|
||||||
|
# package.
|
||||||
|
# FIRST AUTHOR <EMAIL@ADDRESS>, 2025.
|
||||||
|
#
|
||||||
|
#, fuzzy
|
||||||
|
msgid ""
|
||||||
|
msgstr ""
|
||||||
|
"Project-Id-Version: vllm-ascend\n"
|
||||||
|
"Report-Msgid-Bugs-To: \n"
|
||||||
|
"POT-Creation-Date: 2025-07-18 09:01+0800\n"
|
||||||
|
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
|
||||||
|
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
|
||||||
|
"Language-Team: zh_CN <LL@li.org>\n"
|
||||||
|
"Language: zh_CN\n"
|
||||||
|
"MIME-Version: 1.0\n"
|
||||||
|
"Content-Type: text/plain; charset=utf-8\n"
|
||||||
|
"Content-Transfer-Encoding: 8bit\n"
|
||||||
|
"Plural-Forms: nplurals=1; plural=0;\n"
|
||||||
|
"Generated-By: Babel 2.17.0\n"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:1
|
||||||
|
msgid "Multi-NPU (Pangu Pro MoE)"
|
||||||
|
msgstr "多NPU(Pangu Pro MoE)"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:3
|
||||||
|
msgid "Run vllm-ascend on Multi-NPU"
|
||||||
|
msgstr "在多NPU上运行 vllm-ascend"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:5
|
||||||
|
msgid "Run container:"
|
||||||
|
msgstr "运行容器:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:30
|
||||||
|
msgid "Setup environment variables:"
|
||||||
|
msgstr "设置环境变量:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:37
|
||||||
|
msgid "Download the model:"
|
||||||
|
msgstr "下载该模型:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:44
|
||||||
|
msgid "Online Inference on Multi-NPU"
|
||||||
|
msgstr "多NPU上的在线推理"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:46
|
||||||
|
msgid "Run the following script to start the vLLM server on Multi-NPU:"
|
||||||
|
msgstr "运行以下脚本,在多NPU上启动 vLLM 服务器:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:55
|
||||||
|
msgid ""
|
||||||
|
"Once your server is started, you can query the model with input prompts:"
|
||||||
|
msgstr "一旦你的服务器启动,你可以通过输入提示词来查询模型:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md
|
||||||
|
msgid "v1/completions"
|
||||||
|
msgstr "v1/completions"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md
|
||||||
|
msgid "v1/chat/completions"
|
||||||
|
msgstr "v1/chat/completions"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:96
|
||||||
|
msgid "If you run this successfully, you can see the info shown below:"
|
||||||
|
msgstr "如果成功运行,你可以看到如下所示的信息:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:102
|
||||||
|
msgid "Offline Inference on Multi-NPU"
|
||||||
|
msgstr "多NPU离线推理"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:104
|
||||||
|
msgid "Run the following script to execute offline inference on multi-NPU:"
|
||||||
|
msgstr "运行以下脚本以在多NPU上执行离线推理:"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md
|
||||||
|
msgid "Graph Mode"
|
||||||
|
msgstr "图模式"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md
|
||||||
|
msgid "Eager Mode"
|
||||||
|
msgstr "即时模式"
|
||||||
|
|
||||||
|
#: ../../tutorials/multi_npu_moge.md:230
|
||||||
|
msgid "If you run this script successfully, you can see the info shown below:"
|
||||||
|
msgstr "如果你成功运行此脚本,你可以看到如下所示的信息:"
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user