Add feature: priority

Signed-off-by: Jing Wang <jingwang96@qq.com>
Author: Jing Wang
Date: 2026-05-12 11:51:57 +00:00
Parent: d627a45881
Commit: b6549b6e38

11 changed files with 382 additions and 66 deletions


@@ -30,6 +30,7 @@ docker build -t $build_image -f ./Dockerfile .
 ### Environment Variables
 - `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of GPU memory reserved for miscellaneous allocations. Only needs to be set for `vllm_vnpu_daemon`. Try increasing the variable if you launch multiple LLM services and encounter OOM. Default: `8`.
 - `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
+- `VLLM_VNPU_PRIORITY`: The priority of the LLM service. Requests from high-priority services are served first. The value must be an integer in the range [0, 7]. Default: `0`.
 ## Limitations
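For illustration, this is how the new variable's validation behaves, sketched in Python (the helper name is hypothetical; the actual parsing lives in `get_shm_priority()` in the C++ worker later in this commit): non-integers and values outside [0, 7] fall back to the default of 0.

```python
import os

def read_vnpu_priority() -> int:
    """Hypothetical mirror of the C++ get_shm_priority(): accept an
    integer in [0, 7], otherwise fall back to the default of 0.
    (The C++ side uses std::stoi and logs a warning on bad input.)"""
    raw = os.environ.get("VLLM_VNPU_PRIORITY")
    if raw is None:
        return 0
    try:
        p = int(raw)
    except ValueError:
        return 0  # invalid format -> default, matching the worker's fallback
    return p if 0 <= p <= 7 else 0
```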


@@ -595,10 +595,29 @@ static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
 }
 static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
-  shm_worker->unlock_gpu();
+  int keep_wait = 0;
+  if (!PyArg_ParseTuple(args, "|p", &keep_wait)) {
+    return NULL;
+  }
+  shm_worker->unlock_gpu(keep_wait != 0);
   Py_RETURN_NONE;
 }
+static PyObject* python_start_wait_offload(PyObject* self, PyObject* args) {
+  shm_worker->start_wait();
+  Py_RETURN_NONE;
+}
+static PyObject* python_cancel_wait_offload(PyObject* self, PyObject* args) {
+  shm_worker->cancel_wait();
+  Py_RETURN_NONE;
+}
+static PyObject* python_has_higher_priority_waiter_offload(PyObject* self, PyObject* args) {
+  bool has_higher = shm_worker->has_higher_priority_waiter();
+  return PyBool_FromLong(has_higher);
+}
 static PyMethodDef module_methods[] = {
   {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
    "Initialize module with python_malloc and python_free callables."},
@@ -619,7 +638,13 @@ static PyMethodDef module_methods[] = {
   {"python_try_lock_gpu_offload", (PyCFunction)python_try_lock_gpu_offload,
    METH_NOARGS, "Lock GPU."},
   {"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
-   METH_NOARGS, "Unlock GPU."},
+   METH_VARARGS, "Unlock GPU."},
+  {"python_start_wait_offload", (PyCFunction)python_start_wait_offload,
+   METH_NOARGS, "Start waiting for GPU lock."},
+  {"python_cancel_wait_offload", (PyCFunction)python_cancel_wait_offload,
+   METH_NOARGS, "Cancel waiting for GPU lock."},
+  {"python_has_higher_priority_waiter_offload", (PyCFunction)python_has_higher_priority_waiter_offload,
+   METH_NOARGS, "Check if there is a higher priority waiter."},
   {NULL, NULL, 0, NULL}  // sentinel
 };
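The `"|p"` format string above makes `keep_wait` an optional boolean predicate, so the binding accepts both call shapes. A minimal usage sketch, assuming the `_offload` variants are exported from the same `vllm_ascend.vllm_ascend_C` module shown in the import block later in this commit (availability checks omitted):

```python
from vllm_ascend.vllm_ascend_C import (
    python_unlock_gpu_offload,
    python_start_wait_offload,
    python_cancel_wait_offload,
)

python_unlock_gpu_offload()       # keep_wait defaults to false
python_start_wait_offload()      # register this worker as a waiter
python_unlock_gpu_offload(True)  # release the lock but keep the waiter record
python_cancel_wait_offload()     # withdraw from the wait queue
```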


@@ -16,8 +16,8 @@
 #include "spdlog/spdlog.h"
-#define MAX_WORKERS 60
-#define MAX_DEVICES 16
+#define MAX_WORKERS 64
+#define MAX_DEVICES 32
 static inline std::string get_shm_name() {
   const char *env_shm_name = getenv("VLLM_VNPU_SHM_NAME");
@@ -34,7 +34,7 @@ static inline std::string get_shm_name() {
 }
 static constexpr uint32_t heartbeat_us = 1000;  // microseconds
-static constexpr uint32_t heartbeat_check_everyN = 50;
+static constexpr uint32_t heartbeat_check_everyN = 100;
 static constexpr uint32_t heartbeat_timeout_us =
     heartbeat_check_everyN * heartbeat_us;
@@ -52,6 +52,8 @@ static inline uint64_t heartbeat_ts_us() {
           .count());
 }
+// GPU flag layout (64 bits):
+// [ lock (1 bit) | reserved (31 bits) | tgid (32 bits) ]
 static inline uint32_t unpack_lock_field(uint64_t gpu_flag) {
   return static_cast<uint32_t>(gpu_flag >> 32);
 }
@@ -68,16 +70,43 @@ static inline uint64_t pack_unlocked_tgid(int32_t tgid) {
   return static_cast<uint64_t>(tgid);
 }
+// waiting_worker_flag layout (64 bits):
+// [ device_id (5 bits) | priority (3 bits) | timestamp (24 bits) | tgid (32 bits) ]
+static inline uint32_t unpack_waiting_device_id(uint64_t flag) {
+  return static_cast<uint32_t>(flag >> 59);
+}
+static inline uint16_t unpack_waiting_priority(uint64_t flag) {
+  return static_cast<uint16_t>((flag >> 56) & 0x7);
+}
+static inline uint32_t unpack_waiting_timestamp_ms(uint64_t flag) {
+  return static_cast<uint32_t>((flag >> 32) & 0xFFFFFF);
+}
+static inline int32_t unpack_waiting_tgid(uint64_t flag) {
+  return static_cast<int32_t>(flag & 0xFFFFFFFF);
+}
+static inline uint64_t pack_waiting_flag(uint32_t device_id, uint16_t priority,
+                                         uint32_t timestamp, int32_t tgid) {
+  return (static_cast<uint64_t>(device_id & 0x1F) << 59) |
+         (static_cast<uint64_t>(priority & 0x7) << 56) |
+         (static_cast<uint64_t>(timestamp & 0xFFFFFF) << 32) |
+         (static_cast<uint64_t>(tgid) & 0xFFFFFFFF);
+}
 // mmap usually page-aligned
 struct alignas(64) ShmHelper {
   struct VramInfo {
     uint64_t total_vmem_size;
     uint64_t shareable_handle;
   };
-  VramInfo vram_info[MAX_DEVICES];  // support max 16 NPUs
+  VramInfo vram_info[MAX_DEVICES];  // support max 32 devices
   // GPU lock flag
   std::atomic<uint64_t> gpu_flag[MAX_DEVICES];
-  // uint8_t _padding1[64 - sizeof(std::atomic<uint64_t>)];
+  std::atomic<uint64_t> waiting_worker_flags[MAX_WORKERS];
   // request
   enum RequestType: uint32_t {
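The packing above fits four fields into one 64-bit word so a waiter can be advertised with a single atomic store. A round-trip sketch of the same layout in Python (the field values are arbitrary examples); note that a flag of 0 is treated as "no waiter", as seen in `has_higher_priority_waiter()` below:

```python
def pack_waiting_flag(device_id: int, priority: int, timestamp_ms: int, tgid: int) -> int:
    # [ device_id (5) | priority (3) | timestamp (24) | tgid (32) ]
    return (((device_id & 0x1F) << 59)
            | ((priority & 0x7) << 56)
            | ((timestamp_ms & 0xFFFFFF) << 32)
            | (tgid & 0xFFFFFFFF))

flag = pack_waiting_flag(device_id=3, priority=5, timestamp_ms=0xABCDEF, tgid=12345)
assert flag >> 59 == 3                       # unpack_waiting_device_id
assert (flag >> 56) & 0x7 == 5               # unpack_waiting_priority
assert (flag >> 32) & 0xFFFFFF == 0xABCDEF   # unpack_waiting_timestamp_ms
assert flag & 0xFFFFFFFF == 12345            # unpack_waiting_tgid
```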


@@ -55,7 +55,7 @@ void ShmManager::run_busy_loop() {
   while (!stop_loop_flag.load(std::memory_order_acquire)) {
     process_requests();
-    if (loop_cnt % heartbeat_check_everyN== 0) {
+    if (loop_cnt % heartbeat_check_everyN == 0) {
       check_heart_beats();
     }
     loop_cnt = (loop_cnt + 1) % heartbeat_check_everyN;
@@ -152,6 +152,8 @@ void ShmManager::check_heart_beats() {
       shm_helper->heart_beats[i].tgid = 0;
       shm_helper->heart_beats[i].timestamp.store(0,
                                                  std::memory_order_release);
+      // clear the dead worker's waiting flag
+      shm_helper->waiting_worker_flags[i].store(0, std::memory_order_release);
       // check dead lock
       for (int gpu_id : valid_gpu_ids) {
         uint64_t gpu_flag =


@@ -1,7 +1,29 @@
 #include "shm_worker.h"
+static inline uint16_t get_shm_priority() {
+  const char *env_priority = getenv("VLLM_VNPU_PRIORITY");
+  if (env_priority) {
+    try {
+      int p = std::stoi(env_priority);
+      if (p >= 0 && p <= 7) {
+        return static_cast<uint16_t>(p);
+      } else {
+        spdlog::warn("VLLM_VNPU_PRIORITY should be between 0 and 7, got {}. Using default 0.", p);
+      }
+    } catch (...) {
+      spdlog::warn("Invalid VLLM_VNPU_PRIORITY format. Using default 0.");
+    }
+  }
+  return 0;
+}
 ShmWorker::ShmWorker() {
+  this->priority = get_shm_priority();
+  this->waiting_timestamp = 0;
+  this->is_waiting = false;
+  this->is_holding_lock = false;
+  spdlog::info("vNPU worker initialized with priority {}", priority);
   std::string shm_name = get_shm_name();
   int shm_fd = shm_open(shm_name.c_str(), O_RDWR, 0666);
   if (shm_fd == -1) {
@@ -40,16 +62,18 @@ bool ShmWorker::register_worker(int32_t tgid, int gpu_id,
   if (slot == -1) {
     return false;
   }
+  this->shm_slot = slot;
   *out_shareable_handle = shm_helper->vram_info[gpu_id].shareable_handle;
   *out_vmem_size = shm_helper->vram_info[gpu_id].total_vmem_size;
   stop_heart_beat.store(false, std::memory_order_release);
-  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this, slot);
+  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this);
   return true;
 }
-void ShmWorker::heart_beat_loop(int slot) {
+void ShmWorker::heart_beat_loop() {
+  int slot = this->shm_slot;
   while (!stop_heart_beat.load(std::memory_order_acquire)) {
     // update heart beat
     int32_t shm_tgid =
@@ -64,6 +88,7 @@ void ShmWorker::heart_beat_loop(int slot) {
         spdlog::error("TGID {} failed to re-register as worker", tgid);
         throw std::runtime_error("Failed to re-register as worker");
       }
+      this->shm_slot = slot;
     }
     uint64_t now = heartbeat_ts_us();
     shm_helper->heart_beats[slot].timestamp.store(now,
@@ -72,32 +97,95 @@ void ShmWorker::heart_beat_loop(int slot) {
   }
 }
+void ShmWorker::start_wait() {
+  if (is_waiting) return;  // keep the older timestamp if already waiting
+  // Use the lower 24 bits of the millisecond timestamp.
+  waiting_timestamp = static_cast<uint32_t>((heartbeat_ts_us() / 1000) & 0xFFFFFF);
+  uint64_t flag = pack_waiting_flag(this->gpu_id, this->priority, waiting_timestamp, this->tgid);
+  shm_helper->waiting_worker_flags[this->shm_slot].store(flag, std::memory_order_release);
+  is_waiting = true;
+}
+void ShmWorker::cancel_wait() {
+  if (!is_waiting) return;
+  shm_helper->waiting_worker_flags[this->shm_slot].store(0, std::memory_order_release);
+  is_waiting = false;
+}
+bool ShmWorker::has_higher_priority_waiter() {
+  for (int i = 0; i < MAX_WORKERS; ++i) {
+    if (i == this->shm_slot) continue;
+    uint64_t flag = shm_helper->waiting_worker_flags[i].load(std::memory_order_acquire);
+    if (flag == 0) continue;
+    if (unpack_waiting_device_id(flag) != this->gpu_id) continue;
+    uint16_t other_prio = unpack_waiting_priority(flag);
+    if (other_prio > this->priority) {
+      return true;  // found a waiter with strictly higher priority
+    } else if (other_prio == this->priority) {
+      if (this->is_holding_lock) {
+        // the current lock holder does not yield to same-priority waiters
+        continue;
+      }
+      if (!this->is_waiting) {
+        // we are not registered as a waiter, so any same-priority waiter wins
+        return true;
+      }
+      uint32_t other_ts = unpack_waiting_timestamp_ms(flag);
+      // Same priority: compare timestamps with 24-bit unsigned subtraction to
+      // handle wrap-around. If the difference falls in the lower half of the
+      // range, our timestamp is greater, i.e. we started waiting later.
+      uint32_t diff = (this->waiting_timestamp - other_ts) & 0xFFFFFF;
+      if (diff > 0 && diff < 0x800000) {
+        return true;  // the other worker started waiting earlier
+      } else if (diff == 0 && unpack_waiting_tgid(flag) < this->tgid) {
+        // break the tie by tgid if the timestamps are exactly the same
+        return true;
+      }
+    }
+  }
+  return false;
+}
 bool ShmWorker::try_lock_gpu(bool &out_self_hold) {
   static int retry_cnt = 0;
   uint64_t old_flag =
       shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
   if (unpack_lock_field(old_flag) == 0) {  // free
+    // Check priority: yield if there are higher-priority waiters, or
+    // same-priority waiters that have been waiting longer.
+    if (has_higher_priority_waiter()) {
+      out_self_hold = false;
+      return false;
+    }
     uint64_t new_flag = pack_locked_tgid(tgid);
     if (shm_helper->gpu_flag[gpu_id].compare_exchange_weak(
             old_flag, new_flag, std::memory_order_acq_rel,
             std::memory_order_acquire)) {
-      spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
       int32_t prev_tgid = unpack_tgid_field(old_flag);
       out_self_hold = prev_tgid == tgid;
       retry_cnt = 0;
+      this->is_holding_lock = true;
       return true;
     }
   } else {  // locked
     if (unpack_tgid_field(old_flag) == tgid) {
-      spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
       out_self_hold = true;
       retry_cnt = 0;
+      this->is_holding_lock = true;
       return true;
     }
   }
   // failed
-  if (++retry_cnt % 2000 == 0) {
+  if (++retry_cnt % 10000 == 0) {
     spdlog::info(
         "TGID {} trying to acquire GPU {} lock, current lock holder TGID {}",
         tgid, gpu_id, unpack_tgid_field(old_flag));
@@ -116,19 +204,23 @@ bool ShmWorker::lock_gpu(bool &out_self_hold) {
   }
 }
-void ShmWorker::unlock_gpu() {
+void ShmWorker::unlock_gpu(bool keep_wait) {
+  if (!keep_wait) {
+    cancel_wait();
+  }
   uint64_t old_flag =
       shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
   if (unpack_tgid_field(old_flag) != tgid) {
-    // spdlog::warn("previous gpu flag {} does not match expected locked flag for "
-    //              "TGID {}. This may be a bug, unless during startup.",
-    //              old_flag, tgid);
-    spdlog::info("TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    if (!keep_wait) {
+      spdlog::info("unlock: TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    }
   } else {
     uint64_t new_flag = pack_unlocked_tgid(tgid);
     shm_helper->gpu_flag[gpu_id].store(new_flag, std::memory_order_release);
-    spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
+    // spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
   }
+  this->is_holding_lock = false;
 }
 uint64_t ShmWorker::make_request(uint32_t type, uint64_t parameter) {
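Because the waiting timestamp keeps only 24 bits of milliseconds, it wraps roughly every 4.7 hours (2^24 ms), so `has_higher_priority_waiter()` orders waiters with modular subtraction rather than a plain comparison. A sketch of that arithmetic:

```python
MASK = 0xFFFFFF  # 24-bit timestamp field
HALF = 0x800000  # half of the 24-bit range

def other_started_earlier(mine_ms: int, other_ms: int) -> bool:
    """If (mine - other) mod 2^24 is nonzero and in the lower half of the
    range, 'mine' is the later timestamp, i.e. the other worker has been
    waiting longer and should win the tie."""
    diff = (mine_ms - other_ms) & MASK
    return 0 < diff < HALF

assert other_started_earlier(100, 50)      # plain ordering
assert not other_started_earlier(50, 100)
assert other_started_earlier(5, 0xFFFFF0)  # other registered just before the wrap
```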


@@ -17,11 +17,21 @@ class ShmWorker {
   bool try_lock_gpu(bool &out_self_hold);
   bool lock_gpu(bool &out_self_hold);
-  void unlock_gpu();
+  void unlock_gpu(bool keep_wait = false);
+  bool has_higher_priority_waiter();
+  void start_wait();
+  void cancel_wait();
 private:
   int32_t tgid;
   int gpu_id;
+  int shm_slot;
+  uint16_t priority;
+  uint32_t waiting_timestamp;
+  bool is_waiting;
+  bool is_holding_lock;
   ShmHelper *shm_helper;
   std::thread heart_beat_thread;
   std::atomic<bool> stop_heart_beat;
@@ -31,5 +41,5 @@ class ShmWorker {
   int register_worker_shm();
   // heart beat
-  void heart_beat_loop(int slot);
+  void heart_beat_loop();
 };


@@ -1,25 +1,29 @@
 #include <iostream>
 #include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <string.h>
-#include <vector>
 #include <atomic>
+#include <fcntl.h>
 #include <mutex>
 #include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
 #include "acl/acl.h"
-#include "shm_manager.h"
 #include "npu_helper.h"
+#include "shm_manager.h"
 #include "spdlog/spdlog.h"
 static ShmManager *shm_manager = nullptr;
+static inline double TO_GB(size_t bytes) {
+  return static_cast<double>(bytes) / (1024.0 * 1024.0 * 1024.0);
+}
 void handle_signal(int sig) {
   if (shm_manager) {
     shm_manager->stop_busy_loop();
@@ -49,7 +53,7 @@ size_t get_reserved_vram_size() {
       reserved_vram_size = size_gb * 1024 * 1024 * 1024;
     } catch (const std::exception &e) {
       spdlog::warn("Failed to parse VNPU_RESERVED_VRAM_SIZE_GB: {}, using "
-                   "default 8GB",
+                   "default 8 GB",
                    e.what());
     }
   }
@@ -68,12 +72,13 @@ void ensure_context(unsigned long long device) {
 }
 void init_acl() {
-  int32_t deviceId=0;
+  int32_t deviceId = 0;
   aclError ret = aclrtSetDevice(deviceId);
   if (ret != ACL_ERROR_NONE) {
-    throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
-        std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__));
+    throw std::runtime_error(
+        "aclrtSetDevice failed with acl error code: " + std::to_string(ret) +
+        " " + __FILE__ + ":" + std::to_string(__LINE__));
   }
 }
@@ -109,8 +114,9 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
     spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code);
     throw std::runtime_error("aclrtGetMemInfo failed");
   } else {
-    spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem,
-                 total);
+    spdlog::info(
+        "aclrtGetMemInfo succeeded, free_mem: {:.2f} GB, total: {:.2f} GB",
+        TO_GB(free_mem), TO_GB(total));
   }
   aclrtPhysicalMemProp prop = {};
@@ -129,13 +135,14 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
                   error_code);
     throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
   } else {
-    spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
-                 granularity);
+    spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {} bytes",
+                 granularity);
   }
   size_t reserved_mem_size = get_reserved_vram_size();
   if (free_mem < reserved_mem_size) {
-    spdlog::error("Not enough free memory to reserve: {}, free_mem: {}",
-                  reserved_mem_size, free_mem);
+    spdlog::error(
+        "Not enough free memory to reserve: {:.2f} GB, free_mem: {:.2f} GB",
+        TO_GB(reserved_mem_size), TO_GB(free_mem));
     throw std::runtime_error("Not enough free memory to reserve");
   }
   out_g_size = free_mem - reserved_mem_size;
@@ -147,8 +154,8 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
     spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
     throw std::runtime_error("aclrtMallocPhysical failed");
   } else {
-    spdlog::info("device {} aclrtMallocPhysical succeeded, size: {}", device_id,
-                 out_g_size);
+    spdlog::info("device {} aclrtMallocPhysical succeeded, size: {:.2f} GB",
                 device_id, TO_GB(out_g_size));
   }
 }


@@ -62,7 +62,10 @@ try:
         python_create_and_map_offload as python_create_and_map, python_unmap_and_release_offload as python_unmap_and_release,
         python_get_mem_info_offload as python_get_mem_info,
         python_try_lock_gpu_offload as python_try_lock_gpu,
-        python_unlock_gpu_offload as python_unlock_gpu
+        python_unlock_gpu_offload as python_unlock_gpu,
+        python_start_wait_offload as python_start_wait,
+        python_cancel_wait_offload as python_cancel_wait,
+        python_has_higher_priority_waiter_offload as python_has_higher_priority_waiter
     )
 else:
     from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
@@ -73,6 +76,9 @@ try:
         python_get_mem_info = None
         python_try_lock_gpu = None
         python_unlock_gpu = None
+        python_start_wait = None
+        python_cancel_wait = None
+        python_has_higher_priority_waiter = None
     lib_name = find_loaded_library("vllm_ascend_C")
     camem_available = True
@@ -84,6 +90,9 @@ except ImportError as e:
     python_get_mem_info = None
     python_try_lock_gpu = None
     python_unlock_gpu = None
+    python_start_wait = None
+    python_cancel_wait = None
+    python_has_higher_priority_waiter = None
     lib_name = None
     libcudart = None
@@ -306,15 +315,37 @@ class CaMemAllocator:
         return False, False
     def _vnpu_lock_gpu(self) -> bool:
+        is_waiting = False
         while True:
             success, _ = self.vnpu_try_lock_gpu()
             if success:
+                if is_waiting:
+                    self.vnpu_cancel_wait()
                 return True
+            else:
+                if not is_waiting:
+                    self.vnpu_start_wait()
+                    is_waiting = True
+                self.vnpu_unlock_gpu(keep_wait=True)
             time.sleep(0.001)
-    def vnpu_unlock_gpu(self):
+    def vnpu_unlock_gpu(self, keep_wait: bool = False):
         if python_unlock_gpu:
-            python_unlock_gpu()
+            python_unlock_gpu(keep_wait)
+    def vnpu_start_wait(self) -> None:
+        if python_start_wait:
+            python_start_wait()
+    def vnpu_cancel_wait(self) -> None:
+        if python_cancel_wait:
+            python_cancel_wait()
+    def vnpu_has_higher_priority_waiter(self) -> bool:
+        if python_has_higher_priority_waiter:
+            return python_has_higher_priority_waiter()
+        return False
     def get_pool_mem_info(self) -> tuple[int, int]:
         """


@@ -1,6 +1,8 @@
+from concurrent.futures import Future
 from logging import DEBUG
-import signal
 import queue
+import signal
+import time
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import logger
@@ -98,8 +100,13 @@ def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
         if engine_core is not None:
             engine_core.shutdown()
 def run_busy_loop(self):
     """Core busy loop of the EngineCore."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False
     while self._handle_shutdown():
         # 1) Poll the input queue until there is work to do.
         self._process_input_queue()
@@ -111,14 +118,52 @@ def run_busy_loop(self):
             prev_is_self = self.model_executor.reload_vram()
             if not prev_is_self:
                 self.reset_prefix_cache()
         # 2) Step the engine core and return the outputs.
         self._process_engine_step()
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+                elif not prepared_yield:
+                    # check whether we should yield every 10 steps
+                    yield_probe_counter = (yield_probe_counter + 1) % 10
+                    if yield_probe_counter == 0:
+                        should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                        if should_yield:
+                            logger.info(
+                                "Found a higher-priority worker. The current engine will yield after finishing in-flight requests."
+                            )
+                            prepared_yield = True
+                            pause_future = self.pause_scheduler(
+                                mode="wait", clear_cache=True
+                            )
+                            def pause_complete(f: Future):
+                                nonlocal prepared_yield
+                                try:
+                                    if f:
+                                        f.result()
+                                    if not self.model_executor.is_offloaded():
+                                        self.model_executor.offload_vram(is_yield=True)
+                                    prepared_yield = False
+                                    logger.info("Current engine has yielded.")
+                                    # The scheduler wakes itself up after yielding;
+                                    # sleep a bit to give other workers a chance.
+                                    time.sleep(2)
+                                    self.resume_scheduler()
+                                except Exception as e:
+                                    logger.exception(f"Failed to yield: {e}.")
+                                    raise e
+                            if pause_future is None:
+                                # pause finished, no in-flight requests
+                                pause_complete(None)
+                            else:
+                                # pause_future will be resolved after all in-flight
+                                # requests finish in _process_input_queue
+                                pause_future.add_done_callback(pause_complete)
     raise SystemExit
@@ -142,7 +187,8 @@ def _process_input_queue(self):
             and not self.model_executor.is_offloaded()
         ):
             self.model_executor.offload_vram()
-        block = self.process_input_queue_block
+        # vNPU: if the scheduler has been resumed and has work, do not block
+        block = self.process_input_queue_block and not self.has_work()
         try:
             req = self.input_queue.get(block=block)
             self._handle_client_request(*req)
@@ -169,6 +215,9 @@ EngineCoreProc._process_input_queue = _process_input_queue
 def DPEngineCoreProc_run_busy_loop(self):
     """Core busy loop of the EngineCore for data parallel case."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False
     # Loop until process is sent a SIGINT or SIGTERM
     while self._handle_shutdown():
@@ -226,13 +275,49 @@ def DPEngineCoreProc_run_busy_loop(self):
             # Increment wave count and reset step counter.
             self.current_wave += 1
             self.step_counter = 0
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+                elif not prepared_yield:
+                    # check whether we should yield every 10 steps
+                    yield_probe_counter = (yield_probe_counter + 1) % 10
+                    if yield_probe_counter == 0:
+                        should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                        if should_yield:
+                            logger.info(
+                                "Found a higher-priority worker. The current engine will yield after finishing in-flight requests."
+                            )
+                            prepared_yield = True
+                            pause_future = self.pause_scheduler(
+                                mode="wait", clear_cache=True
+                            )
+                            def pause_complete(f: Future):
+                                nonlocal prepared_yield
+                                try:
+                                    if f:
+                                        f.result()
+                                    if not self.model_executor.is_offloaded():
+                                        self.model_executor.offload_vram(is_yield=True)
+                                    prepared_yield = False
+                                    logger.info("Current engine has yielded.")
+                                    # The scheduler wakes itself up after yielding;
+                                    # sleep a bit to give other workers a chance.
+                                    time.sleep(2)
+                                    self.resume_scheduler()
+                                except Exception as e:
+                                    logger.exception(f"Failed to yield: {e}.")
+                                    raise e
+                            if pause_future is None:
+                                # pause finished, no in-flight requests
+                                pause_complete(None)
+                            else:
+                                # pause_future will be resolved after all in-flight
+                                # requests finish in _process_input_queue
+                                pause_future.add_done_callback(pause_complete)
     raise SystemExit
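Both busy loops share the same yield path; stripped of the Future plumbing it reduces to: probe for higher-priority waiters only every 10 idle steps, drain in-flight requests, offload with `is_yield=True`, sleep briefly so a waiter can take the device, then resume. A condensed sketch (the engine methods are abstracted as callables; the real code defers the tail to a pause-completion callback):

```python
import time

PROBE_EVERY = 10  # matches the yield_probe_counter modulo above

def idle_step(state: dict, executor, pause_scheduler, resume_scheduler) -> None:
    """Illustrative reduction of the vNPU branch of run_busy_loop; 'state'
    carries the probe counter across calls."""
    state["counter"] = (state["counter"] + 1) % PROBE_EVERY
    if state["counter"] != 0:
        return
    if not executor.vnpu_has_higher_priority_waiter():
        return
    pause_scheduler(mode="wait", clear_cache=True)  # finish in-flight requests
    if not executor.is_offloaded():
        executor.offload_vram(is_yield=True)        # offloading releases the GPU lock
    time.sleep(2)                                   # give the waiter a window
    resume_scheduler()
```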


@@ -8,7 +8,12 @@ def is_offloaded(self) -> bool:
         self._is_offloaded = False
     return self._is_offloaded
-def offload_vram(self):
+def is_yielded(self) -> bool:
+    if not hasattr(self, "_is_yielded"):
+        self._is_yielded = False
+    return self._is_yielded
+def offload_vram(self, is_yield: bool = False):
     if self.is_offloaded():
         logger.warning("Executor is already offloaded.")
         return
@@ -17,14 +22,18 @@ def offload_vram(self):
     time_after_offload = time.perf_counter()
     self._is_offloaded = True
-    logger.info(f"Offloading VRAM costs {time_after_offload - time_before_offload:.6f} seconds.")
+    if is_yield:
+        self._is_yielded = True
+    logger.info(
+        f"Offloading VRAM costs {time_after_offload - time_before_offload:.3f} seconds."
+    )
 def reload_vram(self) -> bool:
     if not self.is_offloaded():
         logger.warning("Executor is not offloaded.")
         return True
+    is_waiting = False
     while True:
         time_before_reload = time.perf_counter()
         res = self.collective_rpc("try_reload_vram")
@@ -33,15 +42,28 @@ def reload_vram(self) -> bool:
         succ = all(x[0] for x in res)
         if succ:
             self._is_offloaded = False
-            logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
+            self._is_yielded = False
             prev_is_self = all(x[1] for x in res)
+            if is_waiting:
+                self.collective_rpc("vnpu_cancel_wait")
+            logger.info(
+                f"Reloading VRAM costs {time_after_reload - time_before_reload:.3f} seconds."
+            )
             return prev_is_self
         else:
             # some workers did not get the lock
-            self.collective_rpc("vnpu_unlock_gpu")
+            if not is_waiting:
+                self.collective_rpc("vnpu_start_wait")
+                is_waiting = True
+            self.collective_rpc("vnpu_unlock_gpu", kwargs={"keep_wait": True})
             time.sleep(0.001)
+def vnpu_has_higher_priority_waiter(self) -> bool:
+    res = self.collective_rpc("vnpu_has_higher_priority_waiter")
+    return any(res)
 Executor.is_offloaded = is_offloaded
 Executor.offload_vram = offload_vram
 Executor.reload_vram = reload_vram
+Executor.vnpu_has_higher_priority_waiter = vnpu_has_higher_priority_waiter


@@ -470,7 +470,7 @@ class NPUWorker(WorkerBase):
         # save memory to host with lock
         self.offload_vram()
         succ, _ = self.try_reload_vram()
-        assert succ, "Failed to reload model weights after offloading."
+        # assert succ, "Failed to reload model weights after offloading."
     def offload_vram(self) -> None:
         allocator = CaMemAllocator.get_instance()
@@ -480,9 +480,21 @@ class NPUWorker(WorkerBase):
         allocator = CaMemAllocator.get_instance()
         return allocator.try_reload_vram(tags=None)
-    def vnpu_unlock_gpu(self) -> None:
+    def vnpu_unlock_gpu(self, keep_wait: bool = False) -> None:
         allocator = CaMemAllocator.get_instance()
-        allocator.vnpu_unlock_gpu()
+        allocator.vnpu_unlock_gpu(keep_wait)
+    def vnpu_start_wait(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_start_wait()
+    def vnpu_cancel_wait(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_cancel_wait()
+    def vnpu_has_higher_priority_waiter(self) -> bool:
+        allocator = CaMemAllocator.get_instance()
+        return allocator.vnpu_has_higher_priority_waiter()
     def compile_or_warm_up_model(self) -> float:
         # Note: need to adapt for graph mode.