add env vars & misc

starkwj
2026-02-11 06:27:58 +00:00
parent 739d074b0c
commit 389030a8f8
128 changed files with 89 additions and 59 deletions


@@ -78,11 +78,11 @@ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
 if(SOC_VERSION STREQUAL "ASCEND310P3")
 file(GLOB VLLM_ASCEND_SRC
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp)
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp)
 else()
 file(GLOB VLLM_ASCEND_SRC
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
 endif()
@@ -95,7 +95,7 @@ include_directories(
 ${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
 ${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
 ${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
-${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
+${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/include
 )
 set(


@@ -59,14 +59,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
 source /usr/local/Ascend/nnal/atb/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \


@@ -51,14 +51,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 source /usr/local/Ascend/nnal/atb/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
 export SOC_VERSION=ASCEND310P3 && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \


@@ -49,14 +49,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
 export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
 export SOC_VERSION=ASCEND310P3 && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \


@@ -60,14 +60,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
 source /usr/local/Ascend/nnal/atb/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \


@@ -63,14 +63,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 source /usr/local/Ascend/nnal/atb/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
 export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \


@@ -62,14 +62,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 source /usr/local/Ascend/nnal/atb/set_env.sh && \
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
 export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
-cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
+cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
 make install && make clean && \
 python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
 python3 -m pip cache purge
 ENV VLLM_ASCEND_ENABLE_NZ=0 \
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
+VLLM_ASCEND_ENABLE_VNPU=1
 # Install modelscope (for fast download) and ray (for multinode)
 RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \


@@ -27,6 +27,10 @@ docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
 2. Start LLM services with this image, following the official usage instructions.
 3. Due to the limited stream resources of the Ascend NPU, you may need to restrict graph capture sizes or disable ACLGraph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [FAQ](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
+### Environment Variables
+- `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of NPU memory reserved for other miscellaneous allocations. Only needs to be set for `vllm_vnpu_daemon`. Try increasing this value if you launch multiple LLM services and run into OOM errors. Default: `8`.
+- `VLLM_VNPU_SHM_NAME`: The name of the shared-memory file. Must be set to the same value in all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
 ## Limitations
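
Both variables added in the hunk above are plain process environment variables with documented defaults. As a minimal illustration of how a consumer could read them (hypothetical helper, not the actual `vllm_vnpu_daemon` source, which this diff does not show):

```cpp
#include <cstdio>
#include <cstdlib>
#include <string>

// Hypothetical helper: read the two documented variables, applying the
// README defaults when they are unset.
static void read_vnpu_env(size_t &reserved_gb, std::string &shm_name) {
    const char *reserved = std::getenv("VNPU_RESERVED_VRAM_SIZE_GB");
    reserved_gb = reserved ? std::strtoull(reserved, nullptr, 10) : 8;

    // Every container in the shared vNPU group must see the same name.
    const char *shm = std::getenv("VLLM_VNPU_SHM_NAME");
    shm_name = shm ? shm : "/vllm_acl_vnpu_offload_shm";
}

int main() {
    size_t reserved_gb = 0;
    std::string shm_name;
    read_vnpu_env(reserved_gb, shm_name);
    std::printf("reserving %zu GB, shm name: %s\n", reserved_gb, shm_name.c_str());
    return 0;
}
```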


@@ -19,8 +19,8 @@
 #include <string>
 #include <atomic>
-#include "idle_offload/shm_worker.h"
-#include "idle_offload/npu_helper.h"
+#include "vnpu_offload/shm_worker.h"
+#include "vnpu_offload/npu_helper.h"
 extern "C" {
@@ -312,8 +312,9 @@ my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
 (aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
 if (!g_python_malloc_callback) {
-throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
-std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+throw std::runtime_error(
+"my_malloc ERROR: g_python_malloc_callback not set." +
+std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
 }
 // Acquire GIL (not in stable ABI officially, but often works)
@@ -346,8 +347,9 @@ __attribute__((visibility("default"))) void
 my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
 // get memory handle from the pointer
 if (!g_python_free_callback) {
-throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
-std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
+throw std::runtime_error(
+"my_free ERROR: g_python_free_callback not set." + std::string(" ") +
+__FILE__ + ":" + std::to_string(__LINE__));
 }
 // Acquire GIL (not in stable ABI officially, but often works)
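
The "Acquire GIL" comments mark where this native code calls back into Python from a non-Python thread, which requires holding the GIL first. A minimal sketch of that guard pattern using the `PyGILState` API (the callback name and signature below are hypothetical; only the `Py*` calls are real CPython API):

```cpp
#include <Python.h>
#include <stdexcept>
#include <string>

// Hypothetical global, mirroring the g_python_malloc_callback checked
// above; it would be assigned from Python during initialization.
static PyObject *g_callback = nullptr;

void *call_python_malloc(Py_ssize_t size, int device) {
    if (!g_callback) {
        throw std::runtime_error(std::string("callback not set ") +
                                 __FILE__ + ":" + std::to_string(__LINE__));
    }
    PyGILState_STATE gil = PyGILState_Ensure();  // safe from any native thread
    PyObject *ret = PyObject_CallFunction(g_callback, "ni", size, device);
    void *ptr = ret ? PyLong_AsVoidPtr(ret) : nullptr;  // nullptr on failure
    Py_XDECREF(ret);
    PyGILState_Release(gil);                     // release exactly once
    return ptr;
}
```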


@@ -1,6 +1,6 @@
 CXX := g++
 TARGET := vllm_vnpu_daemon
-SRCS := offload_daemon.cpp shm_manager.cpp
+SRCS := vnpu_daemon.cpp shm_manager.cpp
 ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
 INCLUDES := -I$(ASCEND_HOME)/include -Iinclude

Some files were not shown because too many files have changed in this diff.