Compare commits

...

6 Commits

Author SHA1 Message Date
starkwj
389030a8f8 add env vars & misc 2026-02-11 06:27:58 +00:00
starkwj
739d074b0c update other platforms' Dockerfile 2026-01-23 03:24:25 +00:00
starkwj
2a571d8bc8 support multi npu partially 2026-01-09 04:36:39 +00:00
starkwj
fa0fb46853 fix reload return value 2026-01-07 07:42:30 +00:00
074ae28d6e Update README.md 2026-01-05 20:33:31 +08:00
starkwj
caf0289e1a add Dockerfile and readme 2026-01-05 11:31:07 +00:00
137 changed files with 1084 additions and 406 deletions

View File

@@ -77,11 +77,12 @@ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
if(SOC_VERSION STREQUAL "ASCEND310P3")
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp)
else()
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
endif()
@@ -94,7 +95,7 @@ include_directories(
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/include
)
set(
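This hunk keys the source list on `SOC_VERSION`: the `ASCEND310P3` branch compiles only the common csrc sources plus the vnpu_offload shm worker, while other SoCs also pull in the idle_offload and batch_matmul_transpose sources. A sketch of how the Dockerfiles drive that selection when installing the package (paths follow the Dockerfiles in this change):

```bash
# 310P images export the SoC explicitly before building vllm-ascend.
export SOC_VERSION=ASCEND310P3
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ \
    --extra-index https://download.pytorch.org/whl/cpu/
# A2/A3 images instead rely on the SOC_VERSION build arg (default ascend910_9391),
# which selects the full source set including idle_offload.
```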

View File

@@ -59,9 +59,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge

View File

@@ -51,9 +51,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge

View File

@@ -49,9 +49,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge

View File

@@ -20,11 +20,13 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG=v0.3.7.post2
ARG SOC_VERSION="ascend910_9391"
COPY . /vllm-workspace/vllm-ascend/
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} \
SOC_VERSION=$SOC_VERSION
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -58,9 +60,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
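The new `SOC_VERSION` build argument (default `ascend910_9391`) is exported as an environment variable so the custom-kernel build picks it up; a hypothetical build invocation pinning it explicitly might look like the following (the image tag and Dockerfile name are placeholders):

```bash
# Pin the SoC target at build time; adjust -f to the Dockerfile variant being built.
docker build \
    --build-arg SOC_VERSION=ascend910_9391 \
    -t vllm-ascend-multi-llm:latest \
    -f ./Dockerfile .
```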

View File

@@ -20,8 +20,10 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ARG SOC_VERSION="ascend910_9391"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} \
SOC_VERSION=${SOC_VERSION}
RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -61,9 +63,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge

View File

@@ -62,9 +62,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge

View File

@@ -0,0 +1,69 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-910b-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,61 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,59 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG=v0.3.7.post2
COPY . /vllm-workspace/vllm-ascend/
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,71 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton will be installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

README-vllm-ascend.md Normal file (91 lines added)
View File

@@ -0,0 +1,91 @@
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
</picture>
</p>
<h3 align="center">
vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
</p>
<p align="center">
<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
</p>
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded; thanks to all contributors.
- [2025/05] We've released the first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
By using the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, and Multi-modal LLMs, can run seamlessly on the Ascend NPU.
## Prerequisites
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
* PyTorch == 2.7.1, torch-npu == 2.7.1
* vLLM (the same version as vllm-ascend)
## Getting Started
Please use the following recommended versions to get started quickly:
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Contributing
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details; it is a step-by-step guide to help you set up a development environment, build, and test.
We welcome and value any contributions and collaborations:
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
## Branch
vllm-ascend has a main branch and dev branches.
- **main**: the main branch corresponds to the vLLM main branch and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branches, created alongside selected vLLM releases. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3`.
Below are the maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version; only bug fixes are allowed and no new release tags any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
## Weekly Meeting
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.

View File

@@ -1,91 +1,38 @@
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-dark.png">
<img alt="vllm-ascend" src="https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/docs/source/logos/vllm-ascend-logo-text-light.png" width=55%>
</picture>
</p>
# XC-LLM: A Specially Optimized LLM Inference Engine for ModelHub XC
<h3 align="center">
vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
</p>
<p align="center">
<a ><b>English</b></a> | <a href="README.zh.md"><b>中文</b></a>
</p>
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploy large scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl//TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
The project builds on the popular LLM inference project vLLM with Ascend-specific optimizations. The current version supports Ascend NPUs (910B3 & 910B4).
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
One of the key features of this project is efficient memory coordination, enabling multiple vLLM instances to share and dynamically hold the Ascend NPU's physical memory. When an instance is idle, its model parameters are offloaded to host memory. Upon a new inference request, the model parameters are quickly restored to NPU memory (if not already resident), without re-initializing the engine or reloading the model from scratch. As a result, from the application's perspective, multiple LLM inference engines can run on the NPU even when their total memory requirements exceed the physical memory limit. This technique is referred to as `InfiniVRAM`.
By using vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, Multi-modal LLMs can run seamlessly on the Ascend NPU.
## Prerequisites
## Installation
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
* PyTorch == 2.7.1, torch-npu == 2.7.1
* vLLM (the same version as vllm-ascend)
### Build from Dockerfile
## Getting Started
Clone this repository and build the image:
Please use the following recommended versions to get started quickly:
```bash
docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
```
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Usage
## Contributing
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up development environment, build and test.
> [!NOTE]
> Some platforms may not allow multiple containers to share the same Ascend NPU. You may try using a privileged container to bypass this restriction, mount all NPUs, and set the env `ASCEND_RT_VISIBLE_DEVICES` to specify the target device to use.
We welcome and value any contributions and collaborations:
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
0. To share an NPU, processes coordinate via shm, so you need to run all containers with `ipc=host`.
1. Start a daemon process in a standalone container by running `vllm_vnpu_daemon`, which is installed inside the image.
2. Start LLM services with this image, following the official usage instructions (see the sketch after this list).
3. Due to the limited stream resources of the Ascend NPU, you may need to restrict graph capture sizes or disable ACL graph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [link](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
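A minimal launch sketch, assuming the image was built as `vllm-ascend-multi-llm:latest` (as above), Docker as the runtime, and placeholder model names and ports; NPU driver mounts required by your platform are omitted:

```bash
# Daemon container: reserves NPU memory and shares it with service containers via shm.
docker run -d --name vnpu-daemon --ipc=host --privileged \
    -e ASCEND_RT_VISIBLE_DEVICES=0 \
    vllm-ascend-multi-llm:latest vllm_vnpu_daemon

# Two LLM services sharing the same NPU; --enforce-eager avoids exhausting stream resources.
docker run -d --name llm-a --ipc=host --privileged -p 8001:8000 \
    -e ASCEND_RT_VISIBLE_DEVICES=0 \
    vllm-ascend-multi-llm:latest \
    vllm serve Qwen/Qwen2.5-7B-Instruct --enforce-eager

docker run -d --name llm-b --ipc=host --privileged -p 8002:8000 \
    -e ASCEND_RT_VISIBLE_DEVICES=0 \
    vllm-ascend-multi-llm:latest \
    vllm serve Qwen/Qwen2.5-14B-Instruct --enforce-eager
```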
## Branch
### Environment Variables
- `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of device memory (in GB) reserved for other miscellaneous usage. Only needs to be set for `vllm_vnpu_daemon`. Try increasing this value if you launch multiple LLM services and encounter OOM. Default: `8`.
- `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
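For example (a sketch; the values and the split between daemon and service shells are illustrative):

```bash
# Daemon side: keep 16 GB of device memory out of the shared pool (default: 8).
VNPU_RESERVED_VRAM_SIZE_GB=16 vllm_vnpu_daemon

# Service side: every container in the same vNPU group must use the same shm name.
export VLLM_VNPU_SHM_NAME=/vllm_acl_vnpu_offload_shm
```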
vllm-ascend has main branch and dev branch.
- **main**: main branchcorresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branch, created with part of new releases of vLLM. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3` version.
## Limitations
Below is maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
## Weekly Meeting
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
- Restricted by the fact that HCCL cannot be shared, deploying more than one model with multi-NPU parallelism (e.g., TP) is currently not feasible.
- The prefix cache is reset when an LLM is restored, since the KV cache is simply discarded when the LLM is offloaded.

View File

@@ -19,7 +19,8 @@
#include <string>
#include <atomic>
#include "idle_offload/shm_worker.h"
#include "vnpu_offload/shm_worker.h"
#include "vnpu_offload/npu_helper.h"
extern "C" {
@@ -311,8 +312,9 @@ my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) {
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
throw std::runtime_error(
"my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
@@ -345,8 +347,9 @@ __attribute__((visibility("default"))) void
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer
if (!g_python_free_callback) {
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
throw std::runtime_error(
"my_free ERROR: g_python_malloc_callback not set." + std::string(" ") +
__FILE__ + ":" + std::to_string(__LINE__));
}
// Acquire GIL (not in stable ABI officially, but often works)
@@ -474,8 +477,10 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
PyObject* malloc_callback = nullptr;
PyObject* free_callback = nullptr;
unsigned long long device = 0;
if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
if (!PyArg_ParseTuple(args, "OOK", &malloc_callback, &free_callback,
&device)) {
return nullptr;
}
@@ -497,7 +502,13 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
}
g_initialized.store(true);
shm_worker = new ShmWorker();
std::vector<int> gpu_ids = get_npu_ids();
if (device >= gpu_ids.size()) {
throw std::runtime_error("Invalid device id: " + std::to_string(device) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
int gpu_id = gpu_ids[device];
// get pid
aclError error_code;
int32_t pid;
@@ -508,11 +519,12 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
std::to_string(error_code) + " " + __FILE__ + ":" +
std::to_string(__LINE__));
}
shm_worker = new ShmWorker();
uint64_t shareable_handle;
shm_worker->register_worker(pid, &shareable_handle, &g_size);
shm_worker->register_worker(pid, gpu_id, &shareable_handle, &g_size);
// import shareable handle
uint32_t device = 0;
aclrtDrvMemHandle memHandle;
error_code =
aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
@@ -570,9 +582,16 @@ static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
return tuple;
}
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) {
bool prev_is_self = shm_worker->lock_gpu();
return PyBool_FromLong(prev_is_self);
static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
bool prev_is_self = false;
bool success = shm_worker->try_lock_gpu(prev_is_self);
PyObject* tuple = PyTuple_New(2);
if (!tuple) {
return nullptr;
}
PyTuple_SetItem(tuple, 0, PyBool_FromLong(success));
PyTuple_SetItem(tuple, 1, PyBool_FromLong(prev_is_self));
return tuple;
}
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
@@ -597,7 +616,7 @@ static PyMethodDef module_methods[] = {
"Unmap and release memory on the device."},
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
METH_NOARGS, "Get mem info in the reserved pool."},
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload,
{"python_try_lock_gpu_offload", (PyCFunction)python_try_lock_gpu_offload,
METH_NOARGS, "Lock GPU."},
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
METH_NOARGS, "Unlock GPU."},

View File

@@ -1,198 +0,0 @@
#include <iostream>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <atomic>
#include <signal.h>
#include "acl/acl.h"
#include "shm_manager.h"
#include "spdlog/spdlog.h"
static constexpr size_t reserved_mem_size = 8ul * 1024 * 1024 * 1024; // 8GB
static ShmManager *shm_manager = nullptr;
void handle_signal(int sig) {
if (shm_manager) {
shm_manager->stop_busy_loop();
}
}
void install_signal_handlers() {
struct sigaction sa{};
sa.sa_handler = handle_signal;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sigaction(SIGINT, &sa, nullptr);
sigaction(SIGTERM, &sa, nullptr);
sigaction(SIGHUP, &sa, nullptr);
}
void ensure_context(unsigned long long device) {
aclrtContext pctx;
aclrtGetCurrentContext(&pctx);
if (!pctx) {
// Ensure device context.
aclrtCreateContext(&pctx, device);
aclrtSetCurrentContext(pctx);
}
}
void init_acl() {
int32_t deviceId=0;
// aclrtStream stream;
bool g_isDevice;
aclError ret = aclrtSetDevice(deviceId);
if (ret != ACL_ERROR_NONE) {
throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
void reset_pids(const std::vector<int32_t> &pids, uint64_t shareable_handle) {
int cnt = pids.size();
if (cnt <= 0) {
return;
}
int32_t pids_data[cnt];
memcpy(pids_data, pids.data(), cnt * sizeof(int32_t));
aclError error_code =
aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt);
if (error_code != 0) {
spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemSetPidToShareableHandle failed");
} else {
spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}",
cnt);
}
}
void start_daemon() {
init_acl();
aclError error_code;
size_t free_mem = 0, total = 0;
error_code = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total);
if (error_code != 0) {
spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code);
throw std::runtime_error("aclrtGetMemInfo failed");
} else {
spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem,
total);
}
uint32_t device = 0;
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
size_t granularity;
error_code = aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, &granularity);
if (error_code != 0) {
spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}", error_code);
throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
} else {
spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
granularity);
}
if (free_mem < reserved_mem_size) {
spdlog::error("Not enough free memory to reserve: {}, free_mem: {}",
reserved_mem_size, free_mem);
throw std::runtime_error("Not enough free memory to reserve");
}
size_t g_size = free_mem - reserved_mem_size;
g_size = (g_size / granularity) * granularity;
// allocate physical memory
aclrtDrvMemHandle mem_handle;
error_code = aclrtMallocPhysical(&mem_handle, g_size, &prop, 0);
if (error_code != 0) {
spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtMallocPhysical failed");
} else {
spdlog::info("aclrtMallocPhysical succeeded, size: {}", g_size);
}
// // reserve address
// void *vmem_addr = nullptr;
// error_code = aclrtReserveMemAddress(&vmem_addr, g_size, 0, nullptr, 0);
// if (error_code != 0) {
// spdlog::error("aclrtReserveMemAddress failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtReserveMemAddress failed");
// } else {
// spdlog::info("aclrtReserveMemAddress succeeded, vmem_addr: {}", vmem_addr);
// }
// // map
// error_code = aclrtMapMem(vmem_addr, g_size, 0, mem_handle, 0);
// if (error_code != 0) {
// spdlog::error("aclrtMapMem failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtMapMem failed");
// } else {
// spdlog::info("aclrtMapMem succeeded, vmem_addr: {}", vmem_addr);
// }
// export
uint64_t shareable_handle;
error_code = aclrtMemExportToShareableHandle(
mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT,
&shareable_handle);
if (error_code != 0) {
spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemExportToShareableHandle failed");
} else {
spdlog::info(
"aclrtMemExportToShareableHandle succeeded, shareable_handle: {}",
shareable_handle);
}
// shm
shm_manager = new ShmManager();
shm_manager->set_gpu_info(g_size, shareable_handle);
shm_manager->register_callback_on_worker_change(
[&](const std::vector<int32_t> &pids) {
reset_pids(pids, shareable_handle);
});
// start busy loop
shm_manager->run_busy_loop();
// stopped by signal
delete shm_manager;
shm_manager = nullptr;
// free physical memory
error_code = aclrtFreePhysical(mem_handle);
if (error_code != 0) {
spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtFreePhysical failed");
}
}
int main() {
install_signal_handlers();
start_daemon();
return 0;
}

View File

@@ -1,12 +1,12 @@
CXX := g++
TARGET := vllm_vnpu_daemon
SRCS := offload_daemon.cpp shm_manager.cpp
SRCS := vnpu_daemon.cpp shm_manager.cpp
ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
LIBS := -L$(ASCEND_HOME)/lib64 -lascendcl
CXXFLAGS := $(INCLUDES)
CXXFLAGS := $(INCLUDES) -O2
LDFLAGS := $(LIBS)
PREFIX ?= /usr/local
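Assuming this is the Makefile under `csrc/vnpu_offload` that the Dockerfiles invoke, a manual build-and-run sketch might look like:

```bash
# Build and install the daemon binary under $(PREFIX) (default /usr/local).
source /usr/local/Ascend/ascend-toolkit/set_env.sh   # expose ACL headers and libraries
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload
make install && make clean

# Run the daemon; it reserves NPU memory and publishes a shareable handle over shm.
vllm_vnpu_daemon
```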

Some files were not shown because too many files have changed in this diff.