add env vars & misc
This commit is contained in:
@@ -78,11 +78,11 @@ message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
|
|||||||
if(SOC_VERSION STREQUAL "ASCEND310P3")
|
if(SOC_VERSION STREQUAL "ASCEND310P3")
|
||||||
file(GLOB VLLM_ASCEND_SRC
|
file(GLOB VLLM_ASCEND_SRC
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp)
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp)
|
||||||
else()
|
else()
|
||||||
file(GLOB VLLM_ASCEND_SRC
|
file(GLOB VLLM_ASCEND_SRC
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/shm_worker.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/shm_worker.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host/tiling/tiling_data.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@@ -95,7 +95,7 @@ include_directories(
|
|||||||
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
|
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
|
||||||
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
|
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/batch_matmul_transpose/op_host
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/csrc/idle_offload/include
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/vnpu_offload/include
|
||||||
)
|
)
|
||||||
|
|
||||||
set(
|
set(
|
||||||
|
|||||||
@@ -59,14 +59,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -51,14 +51,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
export SOC_VERSION=ASCEND310P3 && \
|
export SOC_VERSION=ASCEND310P3 && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -49,14 +49,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||||
export SOC_VERSION=ASCEND310P3 && \
|
export SOC_VERSION=ASCEND310P3 && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -60,14 +60,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -63,14 +63,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -62,14 +62,14 @@ RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
|
|||||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||||
cd /vllm-workspace/vllm-ascend/csrc/idle_offload && \
|
cd /vllm-workspace/vllm-ascend/csrc/vnpu_offload && \
|
||||||
make install && make clean && \
|
make install && make clean && \
|
||||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||||
python3 -m pip cache purge
|
python3 -m pip cache purge
|
||||||
|
|
||||||
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
ENV VLLM_ASCEND_ENABLE_NZ=0 \
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
VLLM_ASCEND_ENABLE_IDLE_OFFLOAD=1
|
VLLM_ASCEND_ENABLE_VNPU=1
|
||||||
|
|
||||||
# Install modelscope (for fast download) and ray (for multinode)
|
# Install modelscope (for fast download) and ray (for multinode)
|
||||||
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
RUN python3 -m pip install modelscope 'ray>=2.47.1,<=2.48.0' 'protobuf>3.20.0' && \
|
||||||
|
|||||||
@@ -27,6 +27,10 @@ docker build -t vllm-ascend-multi-llm:latest -f ./Dockerfile .
|
|||||||
2. Start LLM services with this image, following the official usage instructions.
|
2. Start LLM services with this image, following the official usage instructions.
|
||||||
3. Due to the limited stream resource of Ascend NPU, you may need to restrict graph capture sizes or disable ACLgraph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [link](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
|
3. Due to the limited stream resource of Ascend NPU, you may need to restrict graph capture sizes or disable ACLgraph by setting `--enforce-eager`, especially when launching multiple LLMs. Refer to the [link](https://docs.vllm.ai/projects/ascend/en/latest/faqs.html#how-to-troubleshoot-and-resolve-size-capture-failures-resulting-from-stream-resource-exhaustion-and-what-are-the-underlying-causes).
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
- `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of reserved GPU memory (in GB) set aside for other miscellaneous usage. Only needs to be set for `vllm_vnpu_daemon`. Try increasing this variable if you launch multiple LLM services and encounter OOM. Default: `8`.
|
||||||
|
- `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
|
||||||
|
|
||||||
|
|
||||||
## Limitations
|
## Limitations
|
||||||
|
|
||||||
|
|||||||
@@ -19,8 +19,8 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
|
||||||
#include "idle_offload/shm_worker.h"
|
#include "vnpu_offload/shm_worker.h"
|
||||||
#include "idle_offload/npu_helper.h"
|
#include "vnpu_offload/npu_helper.h"
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
|
||||||
@@ -312,8 +312,9 @@ my_malloc_offload(ssize_t size, int device, aclrtStream stream) {
|
|||||||
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
|
||||||
|
|
||||||
if (!g_python_malloc_callback) {
|
if (!g_python_malloc_callback) {
|
||||||
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
|
throw std::runtime_error(
|
||||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
"my_malloc ERROR: g_python_malloc_callback not set." +
|
||||||
|
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Acquire GIL (not in stable ABI officially, but often works)
|
// Acquire GIL (not in stable ABI officially, but often works)
|
||||||
@@ -346,8 +347,9 @@ __attribute__((visibility("default"))) void
|
|||||||
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
my_free_offload(void *ptr, ssize_t size, int device, aclrtStream stream) {
|
||||||
// get memory handle from the pointer
|
// get memory handle from the pointer
|
||||||
if (!g_python_free_callback) {
|
if (!g_python_free_callback) {
|
||||||
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
|
throw std::runtime_error(
|
||||||
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
|
"my_free ERROR: g_python_malloc_callback not set." + std::string(" ") +
|
||||||
|
__FILE__ + ":" + std::to_string(__LINE__));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Acquire GIL (not in stable ABI officially, but often works)
|
// Acquire GIL (not in stable ABI officially, but often works)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
CXX := g++
|
CXX := g++
|
||||||
TARGET := vllm_vnpu_daemon
|
TARGET := vllm_vnpu_daemon
|
||||||
SRCS := offload_daemon.cpp shm_manager.cpp
|
SRCS := vnpu_daemon.cpp shm_manager.cpp
|
||||||
|
|
||||||
ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
|
ASCEND_HOME := /usr/local/Ascend/ascend-toolkit/latest
|
||||||
INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
|
INCLUDES := -I$(ASCEND_HOME)/include -Iinclude
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user