add env vars & misc
This commit is contained in:
@@ -78,11 +78,11 @@ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
|
||||
if(SOC_VERSION STREQUAL "ASCEND310P3")
|
||||
file(GLOB VLLM_ASCEND_SRC
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp)
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp)
|
||||
else()
|
||||
file(GLOB VLLM_ASCEND_SRC
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
|
||||
endif()
|
||||
|
||||
@@ -95,7 +95,7 @@ include_directories(
|
||||
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
|
||||
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/include
|
||||
)
|
||||
|
||||
set(
|
||||
|
||||
@@ -59,14 +59,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -51,14 +51,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export SOC_VERSION=ASCEND310P3 && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -49,14 +49,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
export SOC_VERSION=ASCEND310P3 && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -60,14 +60,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -63,14 +63,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -62,14 +62,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
||||
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||
make install && make clean && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
||||
VLLM_ASCEND_ENABLE_VNPU=1
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||
|
||||
@@ -27,6 +27,10 @@ docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
|
||||
2. Start LLM services with this image, following the official usage instructions.
|
||||
3. Due to the limited stream resource of Ascend NPU, you may need to restrict graph capture sizes or disable ACLgraph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [link](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
|
||||
|
||||
### Environment Variables
|
||||
- `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of NPU memory reserved for other miscellaneous usage. Only needs to be set for `vllm_vnpu_daemon`. Try increasing the variable if you launch multiple LLM services and encounter OOM. Default: `8`.
|
||||
- `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
|
||||
|
||||
|
||||
## Limitations
|
||||
|
||||
|
||||
@@ -19,8 +19,8 @@
|
||||
#include <string>
|
||||
#include <atomic>
|
||||
|
||||
#include "idle_offload/shm_worker.h"
|
||||
#include "idle_offload/npu_helper.h"
|
||||
#include "vnpu_offload/shm_worker.h"
|
||||
#include "vnpu_offload/npu_helper.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
@@ -312,8 +312,9 @@ my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
|
||||
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
||||
|
||||
if (!g_python_malloc_callback) {
|
||||
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
throw std::runtime_error(
|
||||
"my_malloc ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
@@ -346,8 +347,9 @@ __attribute__((visibility("default"))) void
|
||||
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
||||
// get memory handle from the pointer
|
||||
if (!g_python_free_callback) {
|
||||
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
|
||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||
throw std::runtime_error(
|
||||
"my_free ERROR: g_python_malloc_callback not set." + std::string(" ") +
|
||||
__FILE__ + ":" + std::to_string(__LINE__));
|
||||
}
|
||||
|
||||
// Acquire GIL (not in stable ABI officially, but often works)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
CXX := g++
|
||||
TARGET := vllm_vnpu_daemon
|
||||
SRCS := offload_daemon.cpp shm_manager.cpp
|
||||
SRCS := vnpu_daemon.cpp shm_manager.cpp
|
||||
|
||||
ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
|
||||
INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user