Compare commits

6 Commits

Author SHA1 Message Date
starkwj
389030a8f8 add env vars & misc 2026-02-11 06:27:58 +00:00
starkwj
739d074b0c update other platforms' Dockerfile 2026-01-23 03:24:25 +00:00
starkwj
2a571d8bc8 support multi npu partially 2026-01-09 04:36:39 +00:00
starkwj
fa0fb46853 fix reload return value 2026-01-07 07:42:30 +00:00
074ae28d6e 更新 README.md 2026-01-05 20:33:31 +08:00
starkwj
caf0289e1a add Dockerfile and readme 2026-01-05 11:31:07 +00:00
136 changed files with 907 additions and 355 deletions

View File

@@ -77,11 +77,12 @@ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
if(SOC_VERSION STREQUAL "ASCEND310P3") if(SOC_VERSION STREQUAL "ASCEND310P3")
file(GLOB VLLM_ASCEND_SRC file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp)
else() else()
file(GLOB VLLM_ASCEND_SRC file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp) ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
endif() endif()
@@ -94,7 +95,7 @@ include_directories(
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include ${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/include
) )
set( set(

View File

@@ -59,14 +59,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \ cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \ make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \ ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \ VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1 VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \

View File

@@ -51,9 +51,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \ export SOC_VERSION=ASCEND310P3 && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge python3 -m pip cache purge

View File

@@ -49,9 +49,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \ export SOC_VERSION=ASCEND310P3 && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge python3 -m pip cache purge

View File

@@ -20,11 +20,13 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1 ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG=v0.3.7.post2 ARG MOONCAKE_TAG=v0.3.7.post2
ARG SOC_VERSION="ascend910_9391"
COPY . /vllm-workspace/vllm-ascend/ COPY . /vllm-workspace/vllm-ascend/
# Define environments # Define environments
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} \
SOC_VERSION=$SOC_VERSION
RUN pip config set global.index-url ${PIP_INDEX_URL} RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -58,9 +60,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge python3 -m pip cache purge

View File

@@ -20,8 +20,10 @@ FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1 ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2" ARG MOONCAKE_TAG="v0.3.7.post2"
ARG SOC_VERSION="ascend910_9391"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS} \
SOC_VERSION=${SOC_VERSION}
RUN pip config set global.index-url ${PIP_INDEX_URL} RUN pip config set global.index-url ${PIP_INDEX_URL}
@@ -61,9 +63,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge python3 -m pip cache purge

View File

@@ -62,9 +62,15 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
source /usr/local/Ascend/nnal/atb/set_env.sh && \ source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \ export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
make install && make clean && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge python3 -m pip cache purge
ENV VLLM_ASCEND_ENABLE_NZ=0 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ASCEND_ENABLE_VNPU=1
# Install modelscope (for fast download) and ray (for multinode) # Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \ RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge python3 -m pip cache purge

View File

@@ -0,0 +1,61 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,59 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG=v0.3.7.post2
COPY . /vllm-workspace/vllm-ascend/
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
# Install Mooncake dependencies
RUN apt-get update -y && \
apt-get install -y git vim wget net-tools gcc g++ cmake libnuma-dev && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
cd /vllm-workspace/Mooncake && bash mooncake_installer.sh -y && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/lib64 && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,71 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.3.rc2-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ARG MOONCAKE_TAG="v0.3.7.post2"
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
SHELL ["/bin/bash", "-c"]
RUN yum update -y && \
yum install -y git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
git clone --depth 1 --branch ${MOONCAKE_TAG} https://github.com/kvcache-ai/Mooncake /vllm-workspace/Mooncake && \
cp /vllm-workspace/vllm-ascend/tools/mooncake_installer.sh /vllm-workspace/Mooncake/ && \
ARCH=$(uname -m) && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/devlib:/usr/local/Ascend/ascend-toolkit/latest/${ARCH}-linux/lib64:$LD_LIBRARY_PATH && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/${ARCH}-openEuler-linux && \
cd /vllm-workspace/Mooncake && \
bash mooncake_installer.sh -y && \
mkdir -p build && cd build && cmake .. -DUSE_ASCEND_DIRECT=ON && \
make -j$(nproc) && make install && \
rm -fr /vllm-workspace/Mooncake/build && \
rm -rf /var/cache/yum/*
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@@ -1,18 +1,10 @@
# vLLM-Ascend Multi-LLM Serving Support # XC-LLM: A Specially Optimized LLM Inference Engine for ModelHub XC
## Overview ## Overview
This repository is a modified version of [vLLM-Ascend](https://github.com/vllm-project/vllm-ascend) designed to enable multiple large language models (LLMs) to share one Ascend NPU. The project is optimized based on the popular LLM inference project vLLM. The current version supports Ascend NPU (910B3 & 910B4).
The key feature of this project is efficient memory coordination, enabling multiple vLLM instances share and dynamically hold Ascend NPU's physical memory. One of the key features of this project is efficient memory coordination, enabling multiple vLLM instances share and dynamically hold Ascend NPU's physical memory. When an instance is idle, model parameters are offloaded to host memory. Upon a new inference request, the model parameters are quickly restored to the NPUs memory (if not exist), without the need to initialize the engine and load the model from scratch. As a result, from the applications perspective, multiple LLM inference engines can run on the NPU even when their total memory requirements exceed the physical memory limit. This technique is referred to as `InfiniVRAM`.
When an instance is idle, model parameters are offloaded to host memory.
Upon a new inference request, the model parameters are quickly restored to the NPUs memory (if not exist), without the need to init the engine and load the model from scratch. (For Qwen3-8B, It only causes 0.3s of additional latency to TTFT on a real restore.)
## Features
- **Shared NPU Usage**: Multiple vLLM instances can access the same Ascend NPU, allowing for multi-LLM serving of different LLMs.
- **Fast Memory Restore**: We decouple the virtual and physical memory allcation. Physical NPU memory is allocated and exported and shared to other LLM engines. LLM engines can restore quickly without reinitialize and memory allocation
## Installation ## Installation
@@ -33,19 +25,14 @@ docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
0. To share NPU, processes coordinate via shm, so you need to set all containers with `ipc=host`. 0. To share NPU, processes coordinate via shm, so you need to set all containers with `ipc=host`.
1. Start a daemon process in a standalone container, by running `vllm_vnpu_daemon` installed inside the image. 1. Start a daemon process in a standalone container, by running `vllm_vnpu_daemon` installed inside the image.
2. Start LLM services with this image, following the official usage instructions. 2. Start LLM services with this image, following the official usage instructions.
3. Due to the limited stream resource of Ascend NPU, you may need to restrict graph capture sizes or disable ACLgraph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [link](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
### Environment Variables
- `VNPU_RESERVED_VRAM_SIZE_GB`: The amonut of reserved GPU memory for other miscellaneous memory. Only needs to be set for `vllm_vnpu_daemon`. Try increasing the variable if you launch multiple LLM services and encounter OOM. Default: `8`.
- `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
## Limitations ## Limitations
- This project only support share a single NPU currently. This is also limited by the fact that HCCL cannot be shared. We haven't figure out how to bypass HCCL. *Help wanted*. - Restricted by the fact that HCCL cannot be shared, deploying more than one model with multi-GPU (e.g., TP) is not feasible currently.
- The prefix cache will be reset when the LLM is restored, since we just simply discard the KV cache when the LLM is offloaded. - The prefix cache will be reset when the LLM is restored, since we just simply discard the KV cache when the LLM is offloaded.
## Roadmap
- [ ] Space-sharing.
- [ ] ...
## License
Apache License 2.0, as found in the [LICENSE](./LICENSE) file.

View File

@@ -19,7 +19,8 @@
#include <string> #include <string>
#include <atomic> #include <atomic>
#include "idle_offload/shm_worker.h" #include "vnpu_offload/shm_worker.h"
#include "vnpu_offload/npu_helper.h"
extern "C" { extern "C" {
@@ -311,7 +312,8 @@ my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle)); (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) { if (!g_python_malloc_callback) {
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." + throw std::runtime_error(
"my_malloc ERROR: g_python_malloc_callback not set." +
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__)); std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
} }
@@ -345,8 +347,9 @@ __attribute__((visibility("default"))) void
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) { my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer // get memory handle from the pointer
if (!g_python_free_callback) { if (!g_python_free_callback) {
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." + throw std::runtime_error(
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__)); "my_free ERROR: g_python_malloc_callback not set." + std::string(" ") +
__FILE__ + ":" + std::to_string(__LINE__));
} }
// Acquire GIL (not in stable ABI officially, but often works) // Acquire GIL (not in stable ABI officially, but often works)
@@ -474,8 +477,10 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
static PyObject* py_init_module_offload(PyObject* self, PyObject* args) { static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
PyObject* malloc_callback = nullptr; PyObject* malloc_callback = nullptr;
PyObject* free_callback = nullptr; PyObject* free_callback = nullptr;
unsigned long long device = 0;
if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { if (!PyArg_ParseTuple(args, "OOK", &malloc_callback, &free_callback,
&device)) {
return nullptr; return nullptr;
} }
@@ -497,7 +502,13 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
} }
g_initialized.store(true); g_initialized.store(true);
shm_worker = new ShmWorker(); std::vector<int> gpu_ids = get_npu_ids();
if (device >= gpu_ids.size()) {
throw std::runtime_error("Invalid device id: " + std::to_string(device) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
int gpu_id = gpu_ids[device];
// get pid // get pid
aclError error_code; aclError error_code;
int32_t pid; int32_t pid;
@@ -508,11 +519,12 @@ static PyObject* py_init_module_offload(PyObject* self, PyObject* args) {
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(error_code) + " " + __FILE__ + ":" +
std::to_string(__LINE__)); std::to_string(__LINE__));
} }
shm_worker = new ShmWorker();
uint64_t shareable_handle; uint64_t shareable_handle;
shm_worker->register_worker(pid, &shareable_handle, &g_size); shm_worker->register_worker(pid, gpu_id, &shareable_handle, &g_size);
// import shareable handle // import shareable handle
uint32_t device = 0;
aclrtDrvMemHandle memHandle; aclrtDrvMemHandle memHandle;
error_code = error_code =
aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle); aclrtMemImportFromShareableHandle(shareable_handle, device, &memHandle);
@@ -570,9 +582,16 @@ static PyObject* python_get_mem_info_offload(PyObject* self, PyObject* args) {
return tuple; return tuple;
} }
static PyObject* python_lock_gpu_offload(PyObject* self, PyObject* args) { static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
bool prev_is_self = shm_worker->lock_gpu(); bool prev_is_self = false;
return PyBool_FromLong(prev_is_self); bool success = shm_worker->try_lock_gpu(prev_is_self);
PyObject* tuple = PyTuple_New(2);
if (!tuple) {
return nullptr;
}
PyTuple_SetItem(tuple, 0, PyBool_FromLong(success));
PyTuple_SetItem(tuple, 1, PyBool_FromLong(prev_is_self));
return tuple;
} }
static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) { static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
@@ -597,7 +616,7 @@ static PyMethodDef module_methods[] = {
"Unmap and release memory on the device."}, "Unmap and release memory on the device."},
{"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload, {"python_get_mem_info_offload", (PyCFunction)python_get_mem_info_offload,
METH_NOARGS, "Get mem info in the reserved pool."}, METH_NOARGS, "Get mem info in the reserved pool."},
{"python_lock_gpu_offload", (PyCFunction)python_lock_gpu_offload, {"python_try_lock_gpu_offload", (PyCFunction)python_try_lock_gpu_offload,
METH_NOARGS, "Lock GPU."}, METH_NOARGS, "Lock GPU."},
{"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload, {"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
METH_NOARGS, "Unlock GPU."}, METH_NOARGS, "Unlock GPU."},

View File

@@ -1,198 +0,0 @@
#include <iostream>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <atomic>
#include <signal.h>
#include "acl/acl.h"
#include "shm_manager.h"
#include "spdlog/spdlog.h"
static constexpr size_t reserved_mem_size = 8ul * 1024 * 1024 * 1024; // 8GB
static ShmManager *shm_manager = nullptr;
void handle_signal(int sig) {
if (shm_manager) {
shm_manager->stop_busy_loop();
}
}
void install_signal_handlers() {
struct sigaction sa{};
sa.sa_handler = handle_signal;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
sigaction(SIGINT, &sa, nullptr);
sigaction(SIGTERM, &sa, nullptr);
sigaction(SIGHUP, &sa, nullptr);
}
void ensure_context(unsigned long long device) {
aclrtContext pctx;
aclrtGetCurrentContext(&pctx);
if (!pctx) {
// Ensure device context.
aclrtCreateContext(&pctx, device);
aclrtSetCurrentContext(pctx);
}
}
void init_acl() {
int32_t deviceId=0;
// aclrtStream stream;
bool g_isDevice;
aclError ret = aclrtSetDevice(deviceId);
if (ret != ACL_ERROR_NONE) {
throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__));
}
}
void reset_pids(const std::vector<int32_t> &pids, uint64_t shareable_handle) {
int cnt = pids.size();
if (cnt <= 0) {
return;
}
int32_t pids_data[cnt];
memcpy(pids_data, pids.data(), cnt * sizeof(int32_t));
aclError error_code =
aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt);
if (error_code != 0) {
spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemSetPidToShareableHandle failed");
} else {
spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}",
cnt);
}
}
void start_daemon() {
init_acl();
aclError error_code;
size_t free_mem = 0, total = 0;
error_code = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total);
if (error_code != 0) {
spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code);
throw std::runtime_error("aclrtGetMemInfo failed");
} else {
spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem,
total);
}
uint32_t device = 0;
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
size_t granularity;
error_code = aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, &granularity);
if (error_code != 0) {
spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}", error_code);
throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
} else {
spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
granularity);
}
if (free_mem < reserved_mem_size) {
spdlog::error("Not enough free memory to reserve: {}, free_mem: {}",
reserved_mem_size, free_mem);
throw std::runtime_error("Not enough free memory to reserve");
}
size_t g_size = free_mem - reserved_mem_size;
g_size = (g_size / granularity) * granularity;
// allocate physical memory
aclrtDrvMemHandle mem_handle;
error_code = aclrtMallocPhysical(&mem_handle, g_size, &prop, 0);
if (error_code != 0) {
spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtMallocPhysical failed");
} else {
spdlog::info("aclrtMallocPhysical succeeded, size: {}", g_size);
}
// // reserve address
// void *vmem_addr = nullptr;
// error_code = aclrtReserveMemAddress(&vmem_addr, g_size, 0, nullptr, 0);
// if (error_code != 0) {
// spdlog::error("aclrtReserveMemAddress failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtReserveMemAddress failed");
// } else {
// spdlog::info("aclrtReserveMemAddress succeeded, vmem_addr: {}", vmem_addr);
// }
// // map
// error_code = aclrtMapMem(vmem_addr, g_size, 0, mem_handle, 0);
// if (error_code != 0) {
// spdlog::error("aclrtMapMem failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtMapMem failed");
// } else {
// spdlog::info("aclrtMapMem succeeded, vmem_addr: {}", vmem_addr);
// }
// export
uint64_t shareable_handle;
error_code = aclrtMemExportToShareableHandle(
mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT,
&shareable_handle);
if (error_code != 0) {
spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemExportToShareableHandle failed");
} else {
spdlog::info(
"aclrtMemExportToShareableHandle succeeded, shareable_handle: {}",
shareable_handle);
}
// shm
shm_manager = new ShmManager();
shm_manager->set_gpu_info(g_size, shareable_handle);
shm_manager->register_callback_on_worker_change(
[&](const std::vector<int32_t> &pids) {
reset_pids(pids, shareable_handle);
});
// start busy loop
shm_manager->run_busy_loop();
// stopped by signal
delete shm_manager;
shm_manager = nullptr;
// free physical memory
error_code = aclrtFreePhysical(mem_handle);
if (error_code != 0) {
spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtFreePhysical failed");
}
}
int main() {
install_signal_handlers();
start_daemon();
return 0;
}

View File

@@ -1,12 +1,12 @@
CXX := g++ CXX := g++
TARGET := vllm_vnpu_daemon TARGET := vllm_vnpu_daemon
SRCS := offload_daemon.cpp shm_manager.cpp SRCS := vnpu_daemon.cpp shm_manager.cpp
ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
INCLUDES := -I$(ASCEND_HOME)/include -Iinclude INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
LIBS := -L$(ASCEND_HOME)/lib64 -lascendcl LIBS := -L$(ASCEND_HOME)/lib64 -lascendcl
CXXFLAGS := $(INCLUDES) CXXFLAGS := $(INCLUDES) -O2
LDFLAGS := $(LIBS) LDFLAGS := $(LIBS)
PREFIX ?= /usr/local PREFIX ?= /usr/local

Some files were not shown because too many files have changed in this diff Show More