@@ -30,6 +30,7 @@ docker build -t $build_image -f ./Dockerfile .
 ### Environment Variables
 - `VNPU_RESERVED_VRAM_SIZE_GB`: The amount of GPU memory reserved for other miscellaneous usage. Only needs to be set for `vllm_vnpu_daemon`. Try increasing this variable if you launch multiple LLM services and encounter OOM. Default: `8`.
 - `VLLM_VNPU_SHM_NAME`: The name of the shm file. Needs to be set for all containers of the shared vNPU group. Default: `/vllm_acl_vnpu_offload_shm`.
+- `VLLM_VNPU_PRIORITY`: The priority of the LLM service. High-priority LLM services are prioritized when processing requests. The value must be an integer in the range [0, 7]. Default: `0`.
 
 
 ## Limitations
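Note (illustrative, not part of the patch): the three variables above are read independently by the daemon and by each worker. A minimal Python sketch mirroring the documented defaults and the [0, 7] priority range, for readers who want to validate a launch configuration up front; the helper name is hypothetical:

```python
import os

def check_vnpu_env() -> dict:
    """Illustrative only: mirrors the defaults and ranges documented above."""
    priority = int(os.environ.get("VLLM_VNPU_PRIORITY", "0"))
    if not 0 <= priority <= 7:
        raise ValueError("VLLM_VNPU_PRIORITY must be an integer in [0, 7]")
    reserved_gb = float(os.environ.get("VNPU_RESERVED_VRAM_SIZE_GB", "8"))
    shm_name = os.environ.get("VLLM_VNPU_SHM_NAME", "/vllm_acl_vnpu_offload_shm")
    return {"priority": priority, "reserved_vram_gb": reserved_gb, "shm_name": shm_name}
```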
@@ -595,10 +595,29 @@ static PyObject* python_try_lock_gpu_offload(PyObject* self, PyObject* args) {
 }
 
 static PyObject* python_unlock_gpu_offload(PyObject* self, PyObject* args) {
-  shm_worker->unlock_gpu();
+  int keep_wait = 0;
+  if (!PyArg_ParseTuple(args, "|p", &keep_wait)) {
+    return NULL;
+  }
+  shm_worker->unlock_gpu(keep_wait != 0);
   Py_RETURN_NONE;
 }
 
+static PyObject* python_start_wait_offload(PyObject* self, PyObject* args) {
+  shm_worker->start_wait();
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_cancel_wait_offload(PyObject* self, PyObject* args) {
+  shm_worker->cancel_wait();
+  Py_RETURN_NONE;
+}
+
+static PyObject* python_has_higher_priority_waiter_offload(PyObject* self, PyObject* args) {
+  bool has_higher = shm_worker->has_higher_priority_waiter();
+  return PyBool_FromLong(has_higher);
+}
+
 static PyMethodDef module_methods[] = {
     {"init_module", (PyCFunction)py_init_module, METH_VARARGS,
      "Initialize module with python_malloc and python_free callables."},
@@ -619,7 +638,13 @@ static PyMethodDef module_methods[] = {
     {"python_try_lock_gpu_offload", (PyCFunction)python_try_lock_gpu_offload,
      METH_NOARGS, "Lock GPU."},
     {"python_unlock_gpu_offload", (PyCFunction)python_unlock_gpu_offload,
-     METH_NOARGS, "Unlock GPU."},
+     METH_VARARGS, "Unlock GPU."},
+    {"python_start_wait_offload", (PyCFunction)python_start_wait_offload,
+     METH_NOARGS, "Start waiting for GPU lock."},
+    {"python_cancel_wait_offload", (PyCFunction)python_cancel_wait_offload,
+     METH_NOARGS, "Cancel waiting for GPU lock."},
+    {"python_has_higher_priority_waiter_offload", (PyCFunction)python_has_higher_priority_waiter_offload,
+     METH_NOARGS, "Check if there is a higher priority waiter."},
     {NULL, NULL, 0, NULL} // sentinel
 };
 
@@ -16,8 +16,8 @@
 #include "spdlog/spdlog.h"
 
 
-#define MAX_WORKERS 60
-#define MAX_DEVICES 16
+#define MAX_WORKERS 64
+#define MAX_DEVICES 32
 
 static inline std::string get_shm_name() {
   const char *env_shm_name = getenv("VLLM_VNPU_SHM_NAME");
@@ -34,7 +34,7 @@ static inline std::string get_shm_name() {
 }
 
 static constexpr uint32_t heartbeat_us = 1000; // microseconds
-static constexpr uint32_t heartbeat_check_everyN = 50;
+static constexpr uint32_t heartbeat_check_everyN = 100;
 static constexpr uint32_t heartbeat_timeout_us =
     heartbeat_check_everyN * heartbeat_us;
 
@@ -52,6 +52,8 @@ static inline uint64_t heartbeat_ts_us() {
           .count());
 }
 
+// GPU flag layout (64 bits):
+// [lock (1 bit) | reserved (31 bits) | tgid (32 bits)]
 static inline uint32_t unpack_lock_field(uint64_t gpu_flag) {
   return static_cast<uint32_t>(gpu_flag >> 32);
 }
@@ -68,16 +70,43 @@ static inline uint64_t pack_unlocked_tgid(int32_t tgid) {
   return static_cast<uint64_t>(tgid);
 }
 
+// waiting_worker_flag layout (64 bits):
+// [ device_id (5 bits) | priority (3 bits) | timestamp (24 bits) | tgid (32 bits) ]
+
+static inline uint32_t unpack_waiting_device_id(uint64_t flag) {
+  return static_cast<uint32_t>(flag >> 59);
+}
+
+static inline uint16_t unpack_waiting_priority(uint64_t flag) {
+  return static_cast<uint16_t>((flag >> 56) & 0x7);
+}
+
+static inline uint32_t unpack_waiting_timestamp_ms(uint64_t flag) {
+  return static_cast<uint32_t>((flag >> 32) & 0xFFFFFF);
+}
+
+static inline int32_t unpack_waiting_tgid(uint64_t flag) {
+  return static_cast<int32_t>(flag & 0xFFFFFFFF);
+}
+
+static inline uint64_t pack_waiting_flag(uint32_t device_id, uint16_t priority,
+                                         uint32_t timestamp, int32_t tgid) {
+  return (static_cast<uint64_t>(device_id & 0x1F) << 59) |
+         (static_cast<uint64_t>(priority & 0x7) << 56) |
+         (static_cast<uint64_t>(timestamp & 0xFFFFFF) << 32) |
+         (static_cast<uint64_t>(tgid) & 0xFFFFFFFF);
+}
+
 // mmap usually page-aligned
 struct alignas(64) ShmHelper {
   struct VramInfo {
     uint64_t total_vmem_size;
     uint64_t shareable_handle;
   };
-  VramInfo vram_info[MAX_DEVICES]; // support max 16 NPUs
+  VramInfo vram_info[MAX_DEVICES]; // support max 32 devices
   // GPU lock flag
   std::atomic<uint64_t> gpu_flag[MAX_DEVICES];
-  // uint8_t _padding1[64 - sizeof(std::atomic<uint64_t>)];
+  std::atomic<uint64_t> waiting_worker_flags[MAX_WORKERS];
 
   // request
   enum RequestType: uint32_t {
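Note (illustrative, not part of the patch): each `waiting_worker_flags` entry packs four fields into one 64-bit word so a single atomic load is enough to inspect a waiter. A standalone Python round-trip of the layout used by `pack_waiting_flag` and the `unpack_waiting_*` helpers:

```python
# waiting_worker_flag layout:
# [ device_id (5 bits) | priority (3 bits) | timestamp_ms (24 bits) | tgid (32 bits) ]
def pack_waiting_flag(device_id: int, priority: int, timestamp_ms: int, tgid: int) -> int:
    return ((device_id & 0x1F) << 59) | ((priority & 0x7) << 56) | \
           ((timestamp_ms & 0xFFFFFF) << 32) | (tgid & 0xFFFFFFFF)

flag = pack_waiting_flag(device_id=3, priority=5, timestamp_ms=0x123456, tgid=4242)
assert flag >> 59 == 3                       # unpack_waiting_device_id
assert (flag >> 56) & 0x7 == 5               # unpack_waiting_priority
assert (flag >> 32) & 0xFFFFFF == 0x123456   # unpack_waiting_timestamp_ms
assert flag & 0xFFFFFFFF == 4242             # unpack_waiting_tgid
```

A zero word means "no waiter in this slot", which is why `start_wait`/`cancel_wait` further down simply store the packed flag or 0.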
@@ -55,7 +55,7 @@ void ShmManager::run_busy_loop() {
   while (!stop_loop_flag.load(std::memory_order_acquire)) {
     process_requests();
 
-    if (loop_cnt % heartbeat_check_everyN== 0) {
+    if (loop_cnt % heartbeat_check_everyN == 0) {
       check_heart_beats();
     }
     loop_cnt = (loop_cnt + 1) % heartbeat_check_everyN;
@@ -152,6 +152,8 @@ void ShmManager::check_heart_beats() {
       shm_helper->heart_beats[i].tgid = 0;
       shm_helper->heart_beats[i].timestamp.store(0,
                                                  std::memory_order_release);
+      // clear waiting flag
+      shm_helper->waiting_worker_flags[i].store(0, std::memory_order_release);
       // check dead lock
       for (int gpu_id : valid_gpu_ids) {
         uint64_t gpu_flag =
@@ -1,7 +1,29 @@
 #include "shm_worker.h"
 
+static inline uint16_t get_shm_priority() {
+  const char *env_priority = getenv("VLLM_VNPU_PRIORITY");
+  if (env_priority) {
+    try {
+      int p = std::stoi(env_priority);
+      if (p >= 0 && p <= 7) {
+        return static_cast<uint16_t>(p);
+      } else {
+        spdlog::warn("VLLM_VNPU_PRIORITY should be between 0 and 7, got {}. Using default 0.", p);
+      }
+    } catch (...) {
+      spdlog::warn("Invalid VLLM_VNPU_PRIORITY format. Using default 0.");
+    }
+  }
+  return 0;
+}
+
 
 ShmWorker::ShmWorker() {
+  this->priority = get_shm_priority();
+  this->waiting_timestamp = 0;
+  this->is_waiting = false;
+  this->is_holding_lock = false;
+  spdlog::info("vNPU worker initialized with priority {}", priority);
   std::string shm_name = get_shm_name();
   int shm_fd = shm_open(shm_name.c_str(), O_RDWR, 0666);
   if (shm_fd == -1) {
@@ -40,16 +62,18 @@ bool ShmWorker::register_worker(int32_t tgid, int gpu_id,
   if (slot == -1) {
     return false;
   }
+  this->shm_slot = slot;
+
   *out_shareable_handle = shm_helper->vram_info[gpu_id].shareable_handle;
   *out_vmem_size = shm_helper->vram_info[gpu_id].total_vmem_size;
 
   stop_heart_beat.store(false, std::memory_order_release);
-  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this, slot);
+  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this);
   return true;
 }
 
-void ShmWorker::heart_beat_loop(int slot) {
+void ShmWorker::heart_beat_loop() {
+  int slot = this->shm_slot;
   while (!stop_heart_beat.load(std::memory_order_acquire)) {
     // update heart beat
     int32_t shm_tgid =
@@ -64,6 +88,7 @@ void ShmWorker::heart_beat_loop(int slot) {
         spdlog::error("TGID {} failed to re-register as worker", tgid);
         throw std::runtime_error("Failed to re-register as worker");
       }
+      this->shm_slot = slot;
     }
     uint64_t now = heartbeat_ts_us();
     shm_helper->heart_beats[slot].timestamp.store(now,
@@ -72,32 +97,95 @@ void ShmWorker::heart_beat_loop(int slot) {
   }
 }
 
+void ShmWorker::start_wait() {
+  if (is_waiting) return; // Keep the older timestamp if already waiting
+
+  // Use lower 24 bits of millisecond timestamp
+  waiting_timestamp = static_cast<uint32_t>((heartbeat_ts_us() / 1000) & 0xFFFFFF);
+
+  uint64_t flag = pack_waiting_flag(this->gpu_id, this->priority, waiting_timestamp, this->tgid);
+  shm_helper->waiting_worker_flags[this->shm_slot].store(flag, std::memory_order_release);
+  is_waiting = true;
+}
+
+void ShmWorker::cancel_wait() {
+  if (!is_waiting) return;
+
+  shm_helper->waiting_worker_flags[this->shm_slot].store(0, std::memory_order_release);
+  is_waiting = false;
+}
+
+bool ShmWorker::has_higher_priority_waiter() {
+  for (int i = 0; i < MAX_WORKERS; ++i) {
+    if (i == this->shm_slot) continue;
+
+    uint64_t flag = shm_helper->waiting_worker_flags[i].load(std::memory_order_acquire);
+    if (flag == 0) continue;
+    if (unpack_waiting_device_id(flag) != this->gpu_id) continue;
+
+    uint16_t other_prio = unpack_waiting_priority(flag);
+
+    if (other_prio > this->priority) {
+      return true; // Found a waiter with higher priority
+    } else if (other_prio == this->priority) {
+      if (this->is_holding_lock) {
+        // doesn't need to yield to same priority waiters
+        continue;
+      }
+      if (!this->is_waiting) {
+        // an earlier waiter with the same priority
+        return true;
+      }
+      uint32_t other_ts = unpack_waiting_timestamp_ms(flag);
+      // Same priority, compare timestamps (handle 24-bit wrap-around).
+      // Using 24-bit unsigned subtraction. If the difference is in the lower half,
+      // my timestamp is greater (i.e., I started waiting later).
+      uint32_t diff = (this->waiting_timestamp - other_ts) & 0xFFFFFF;
+      if (diff > 0 && diff < 0x800000) {
+        return true; // The other worker started waiting earlier
+      } else if (diff == 0 && unpack_waiting_tgid(flag) < this->tgid) {
+        // using tgid if timestamps happen to be exactly the same
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool ShmWorker::try_lock_gpu(bool &out_self_hold) {
   static int retry_cnt = 0;
 
   uint64_t old_flag =
       shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
   if (unpack_lock_field(old_flag) == 0) { // free
+    // Check priority: yield if there are higher priority waiters, or same priority waiters who have waited longer.
+    if (has_higher_priority_waiter()) {
+      out_self_hold = false;
+      return false;
+    }
+
     uint64_t new_flag = pack_locked_tgid(tgid);
     if (shm_helper->gpu_flag[gpu_id].compare_exchange_weak(
             old_flag, new_flag, std::memory_order_acq_rel,
             std::memory_order_acquire)) {
-      spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
      int32_t prev_tgid = unpack_tgid_field(old_flag);
      out_self_hold = prev_tgid == tgid;
      retry_cnt = 0;
+      this->is_holding_lock = true;
      return true;
    }
  } else { // locked
    if (unpack_tgid_field(old_flag) == tgid) {
-      spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
      out_self_hold = true;
      retry_cnt = 0;
+      this->is_holding_lock = true;
      return true;
    }
  }
  // failed
-  if (++retry_cnt % 2000 == 0) {
+  if (++retry_cnt % 10000 == 0) {
    spdlog::info(
        "TGID {} trying to acquire GPU {} lock, current lock holder TGID {}",
        tgid, gpu_id, unpack_tgid_field(old_flag));
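Note (illustrative, not part of the patch): the same-priority tie-break stores only the low 24 bits of a millisecond timestamp, so the comparison has to tolerate wrap-around. The `(mine - other) & 0xFFFFFF` check treats differences in the lower half of the 24-bit range as "the other waiter started earlier":

```python
MASK, HALF = 0xFFFFFF, 0x800000  # 24-bit space and half of it

def other_started_earlier(mine: int, other: int) -> bool:
    # Mirrors the C++ check: if (mine - other) mod 2**24 lands in (0, 2**23),
    # "mine" is the later timestamp, so the other waiter should win.
    diff = (mine - other) & MASK
    return 0 < diff < HALF

assert other_started_earlier(mine=1000, other=900)        # plain case
assert not other_started_earlier(mine=900, other=1000)
assert other_started_earlier(mine=5, other=MASK - 5)      # other waited across the 24-bit rollover
```

Ties at identical timestamps fall back to comparing TGIDs, as in the code above.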
@@ -116,19 +204,23 @@ bool ShmWorker::lock_gpu(bool &out_self_hold) {
   }
 }
 
-void ShmWorker::unlock_gpu() {
+void ShmWorker::unlock_gpu(bool keep_wait) {
+  if (!keep_wait) {
+    cancel_wait();
+  }
+
   uint64_t old_flag =
       shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
   if (unpack_tgid_field(old_flag) != tgid) {
-    // spdlog::warn("previous gpu flag {} does not match expected locked flag for "
-    //              "TGID {}. This may be a bug, unless during startup.",
-    //              old_flag, tgid);
-    spdlog::info("TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    if (!keep_wait) {
+      spdlog::info("unlock: TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    }
   } else {
     uint64_t new_flag = pack_unlocked_tgid(tgid);
     shm_helper->gpu_flag[gpu_id].store(new_flag, std::memory_order_release);
-    spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
+    // spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
   }
+  this->is_holding_lock = false;
 }
 
 uint64_t ShmWorker::make_request(uint32_t type, uint64_t parameter) {
@@ -17,11 +17,21 @@ class ShmWorker {
 
   bool try_lock_gpu(bool &out_self_hold);
   bool lock_gpu(bool &out_self_hold);
-  void unlock_gpu();
+  void unlock_gpu(bool keep_wait = false);
+
+  bool has_higher_priority_waiter();
+  void start_wait();
+  void cancel_wait();
 
  private:
   int32_t tgid;
   int gpu_id;
+  int shm_slot;
+  uint16_t priority;
+  uint32_t waiting_timestamp;
+  bool is_waiting;
+  bool is_holding_lock;
+
   ShmHelper *shm_helper;
   std::thread heart_beat_thread;
   std::atomic<bool> stop_heart_beat;
@@ -31,5 +41,5 @@ class ShmWorker {
   int register_worker_shm();
 
   // heart beat
-  void heart_beat_loop(int slot);
+  void heart_beat_loop();
 };
@@ -1,25 +1,29 @@
 #include <iostream>
 #include <sys/types.h>
 
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <string.h>
-#include <vector>
 #include <atomic>
+#include <fcntl.h>
 #include <mutex>
 #include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
 
 #include "acl/acl.h"
 
-#include "shm_manager.h"
 #include "npu_helper.h"
+#include "shm_manager.h"
 #include "spdlog/spdlog.h"
 
 
 static ShmManager *shm_manager = nullptr;
 
+static inline double TO_GB(size_t bytes) {
+  return static_cast<double>(bytes) / (1024.0 * 1024.0 * 1024.0);
+}
+
 void handle_signal(int sig) {
   if (shm_manager) {
     shm_manager->stop_busy_loop();
@@ -49,7 +53,7 @@ size_t get_reserved_vram_size() {
       reserved_vram_size = size_gb * 1024 * 1024 * 1024;
     } catch (const std::exception &e) {
       spdlog::warn("Failed to parse VNPU_RESERVED_VRAM_SIZE_GB: {}, using "
-                   "default 8GB",
+                   "default 8 GB",
                    e.what());
     }
   }
@@ -68,12 +72,13 @@ void ensure_context(unsigned long long device) {
 }
 
 void init_acl() {
-  int32_t deviceId=0;
+  int32_t deviceId = 0;
 
   aclError ret = aclrtSetDevice(deviceId);
   if (ret != ACL_ERROR_NONE) {
-    throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
-        std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__));
+    throw std::runtime_error(
+        "aclrtSetDevice failed with acl error code: " + std::to_string(ret) +
+        " " + __FILE__ + ":" + std::to_string(__LINE__));
   }
 }
 
@@ -109,8 +114,9 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
     spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code);
     throw std::runtime_error("aclrtGetMemInfo failed");
   } else {
-    spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem,
-                 total);
+    spdlog::info(
+        "aclrtGetMemInfo succeeded, free_mem: {:.2f} GB, total: {:.2f} GB",
+        TO_GB(free_mem), TO_GB(total));
   }
 
   aclrtPhysicalMemProp prop = {};
@@ -129,13 +135,14 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
                   error_code);
     throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
   } else {
-    spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
+    spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {} bytes",
                  granularity);
   }
   size_t reserved_mem_size = get_reserved_vram_size();
   if (free_mem < reserved_mem_size) {
-    spdlog::error("Not enough free memory to reserve: {}, free_mem: {}",
-                  reserved_mem_size, free_mem);
+    spdlog::error(
+        "Not enough free memory to reserve: {:.2f} GB, free_mem: {:.2f} GB",
+        TO_GB(reserved_mem_size), TO_GB(free_mem));
     throw std::runtime_error("Not enough free memory to reserve");
   }
   out_g_size = free_mem - reserved_mem_size;
@@ -147,8 +154,8 @@ void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
     spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
     throw std::runtime_error("aclrtMallocPhysical failed");
   } else {
-    spdlog::info("device {} aclrtMallocPhysical succeeded, size: {}", device_id,
-                 out_g_size);
+    spdlog::info("device {} aclrtMallocPhysical succeeded, size: {:.2f} GB",
+                 device_id, TO_GB(out_g_size));
   }
 }
 
@@ -62,7 +62,10 @@ try:
             python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
             python_get_mem_info_offload as python_get_mem_info,
             python_try_lock_gpu_offload as python_try_lock_gpu,
-            python_unlock_gpu_offload as python_unlock_gpu
+            python_unlock_gpu_offload as python_unlock_gpu,
+            python_start_wait_offload as python_start_wait,
+            python_cancel_wait_offload as python_cancel_wait,
+            python_has_higher_priority_waiter_offload as python_has_higher_priority_waiter
         )
     else:
         from vllm_ascend.vllm_ascend_C import ( # type: ignore # noqa: F401
@@ -73,6 +76,9 @@ try:
         python_get_mem_info = None
         python_try_lock_gpu = None
        python_unlock_gpu = None
+        python_start_wait = None
+        python_cancel_wait = None
+        python_has_higher_priority_waiter = None
 
     lib_name = find_loaded_library("vllm_ascend_C")
     camem_available = True
@@ -84,6 +90,9 @@ except ImportError as e:
     python_get_mem_info = None
     python_try_lock_gpu = None
     python_unlock_gpu = None
+    python_start_wait = None
+    python_cancel_wait = None
+    python_has_higher_priority_waiter = None
    lib_name = None
    libcudart = None
 
@@ -306,15 +315,37 @@ class CaMemAllocator:
         return False, False
 
     def _vnpu_lock_gpu(self) -> bool:
+        is_waiting = False
         while True:
             success, _ = self.vnpu_try_lock_gpu()
             if success:
+                if is_waiting:
+                    self.vnpu_cancel_wait()
                 return True
+            else:
+                if not is_waiting:
+                    self.vnpu_start_wait()
+                    is_waiting = True
+                self.vnpu_unlock_gpu(keep_wait=True)
+
             time.sleep(0.001)
 
-    def vnpu_unlock_gpu(self):
+    def vnpu_unlock_gpu(self, keep_wait: bool = False):
         if python_unlock_gpu:
-            python_unlock_gpu()
+            python_unlock_gpu(keep_wait)
 
+    def vnpu_start_wait(self) -> None:
+        if python_start_wait:
+            python_start_wait()
+
+    def vnpu_cancel_wait(self) -> None:
+        if python_cancel_wait:
+            python_cancel_wait()
+
+    def vnpu_has_higher_priority_waiter(self) -> bool:
+        if python_has_higher_priority_waiter:
+            return python_has_higher_priority_waiter()
+        return False
+
     def get_pool_mem_info(self) -> tuple[int, int]:
         """
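Note (illustrative, not part of the patch): the `keep_wait` argument exists so a worker can release the lock without losing its place among the waiters, for example when only some ranks managed to reload. At the allocator level the two call patterns look roughly like this, assuming the `CaMemAllocator` singleton shown in this file:

```python
allocator = CaMemAllocator.get_instance()

# Normal release: drop the lock and also clear this worker's waiting flag,
# so it stops competing for the device.
allocator.vnpu_unlock_gpu()

# Back-off during a contended acquire/reload: register as a waiter once
# (publishing priority and wait timestamp), then release while staying
# registered and retry a moment later.
allocator.vnpu_start_wait()
allocator.vnpu_unlock_gpu(keep_wait=True)
```

`vnpu_cancel_wait()` is called once the lock is finally obtained, clearing the waiting flag.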
@@ -1,6 +1,8 @@
+from concurrent.futures import Future
 from logging import DEBUG
-import signal
 import queue
+import signal
+import time
 
 from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import logger
@@ -98,8 +100,13 @@ def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
         if engine_core is not None:
             engine_core.shutdown()
 
 
 def run_busy_loop(self):
     """Core busy loop of the EngineCore."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False
+
     while self._handle_shutdown():
         # 1) Poll the input queue until there is work to do.
         self._process_input_queue()
@@ -111,14 +118,52 @@ def run_busy_loop(self):
                 prev_is_self = self.model_executor.reload_vram()
                 if not prev_is_self:
                     self.reset_prefix_cache()
 
         # 2) Step the engine core and return the outputs.
         self._process_engine_step()
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+            elif not prepared_yield:
+                # check should yield every 10 steps
+                yield_probe_counter = (yield_probe_counter + 1) % 10
+                if yield_probe_counter == 0:
+                    should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                    if should_yield:
+                        logger.info(
+                            "Found other higher priority worker. Current engine will yield after finishing in-flight requests."
+                        )
+                        prepared_yield = True
+                        pause_future = self.pause_scheduler(
+                            mode="wait", clear_cache=True
+                        )
+
+                        def pause_complete(f: Future):
+                            nonlocal prepared_yield
+                            try:
+                                if f:
+                                    f.result()
+                                if not self.model_executor.is_offloaded():
+                                    self.model_executor.offload_vram(is_yield=True)
+                                prepared_yield = False
+                                logger.info("Current engine has yielded.")
+                                # Scheduler should wake up itself after yielding.
+                                # Sleep some time to give chance to other worker.
+                                time.sleep(2)
+                                self.resume_scheduler()
+                            except Exception as e:
+                                logger.exception(f"Failed to yield: {e}.")
+                                raise e
+
+                        if pause_future is None:
+                            # pause finished, no in-flight requests
+                            pause_complete(None)
+                        else:
+                            # pause_future will be set after all in-flight
+                            # requests are finished in _process_input_queue
+                            pause_future.add_done_callback(pause_complete)
+
     raise SystemExit
 
@@ -142,7 +187,8 @@ def _process_input_queue(self):
             and not self.model_executor.is_offloaded()
         ):
             self.model_executor.offload_vram()
-        block = self.process_input_queue_block
+        # vNPU: if scheduler is resumed and has work, should not block
+        block = self.process_input_queue_block and not self.has_work()
         try:
             req = self.input_queue.get(block=block)
             self._handle_client_request(*req)
@@ -169,6 +215,9 @@ EngineCoreProc._process_input_queue = _process_input_queue
 
 def DPEngineCoreProc_run_busy_loop(self):
     """Core busy loop of the EngineCore for data parallel case."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False
 
     # Loop until process is sent a SIGINT or SIGTERM
     while self._handle_shutdown():
@@ -227,12 +276,48 @@ def DPEngineCoreProc_run_busy_loop(self):
                 self.current_wave += 1
                 self.step_counter = 0
 
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+            elif not prepared_yield:
+                # check should yield every 10 steps
+                yield_probe_counter = (yield_probe_counter + 1) % 10
+                if yield_probe_counter == 0:
+                    should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                    if should_yield:
+                        logger.info(
+                            "Found other higher priority worker. Current engine will yield after finishing in-flight requests."
+                        )
+                        prepared_yield = True
+                        pause_future = self.pause_scheduler(
+                            mode="wait", clear_cache=True
+                        )
+
+                        def pause_complete(f: Future):
+                            nonlocal prepared_yield
+                            try:
+                                if f:
+                                    f.result()
+                                if not self.model_executor.is_offloaded():
+                                    self.model_executor.offload_vram(is_yield=True)
+                                prepared_yield = False
+                                logger.info("Current engine has yielded.")
+                                # Scheduler should wake up itself after yielding.
+                                # Sleep some time to give chance to other worker.
+                                time.sleep(2)
+                                self.resume_scheduler()
+                            except Exception as e:
+                                logger.exception(f"Failed to yield: {e}.")
+                                raise e
+
+                        if pause_future is None:
+                            # pause finished, no in-flight requests
+                            pause_complete(None)
+                        else:
+                            # pause_future will be set after all in-flight
+                            # requests are finished in _process_input_queue
+                            pause_future.add_done_callback(pause_complete)
+
     raise SystemExit
 
|||||||
@@ -8,7 +8,12 @@ def is_offloaded(self) -> bool:
|
|||||||
self._is_offloaded = False
|
self._is_offloaded = False
|
||||||
return self._is_offloaded
|
return self._is_offloaded
|
||||||
|
|
||||||
def offload_vram(self):
|
def is_yielded(self) -> bool:
|
||||||
|
if not hasattr(self, "_is_yielded"):
|
||||||
|
self._is_yielded = False
|
||||||
|
return self._is_yielded
|
||||||
|
|
||||||
|
def offload_vram(self, is_yield: bool = False):
|
||||||
if self.is_offloaded():
|
if self.is_offloaded():
|
||||||
logger.warning("Executor is already offloaded.")
|
logger.warning("Executor is already offloaded.")
|
||||||
return
|
return
|
||||||
@@ -17,14 +22,18 @@ def offload_vram(self):
|
|||||||
time_after_offload = time.perf_counter()
|
time_after_offload = time.perf_counter()
|
||||||
|
|
||||||
self._is_offloaded = True
|
self._is_offloaded = True
|
||||||
logger.info(f"Offloading VRAM costs {time_after_offload - time_before_offload:.6f} seconds.")
|
if is_yield:
|
||||||
|
self._is_yielded = True
|
||||||
|
logger.info(
|
||||||
|
f"Offloading VRAM costs {time_after_offload - time_before_offload:.3f} seconds."
|
||||||
|
)
|
||||||
|
|
||||||
def reload_vram(self) -> bool:
|
def reload_vram(self) -> bool:
|
||||||
if not self.is_offloaded():
|
if not self.is_offloaded():
|
||||||
logger.warning("Executor is not offloaded.")
|
logger.warning("Executor is not offloaded.")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
is_waiting = False
|
||||||
while True:
|
while True:
|
||||||
time_before_reload = time.perf_counter()
|
time_before_reload = time.perf_counter()
|
||||||
res = self.collective_rpc("try_reload_vram")
|
res = self.collective_rpc("try_reload_vram")
|
||||||
@@ -33,15 +42,28 @@ def reload_vram(self) -> bool:
         succ = all(x[0] for x in res)
         if succ:
             self._is_offloaded = False
-            logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
+            self._is_yielded = False
             prev_is_self = all(x[1] for x in res)
+            if is_waiting:
+                self.collective_rpc("vnpu_cancel_wait")
+            logger.info(
+                f"Reloading VRAM costs {time_after_reload - time_before_reload:.3f} seconds."
+            )
             return prev_is_self
         else:
             # some workers not get lock
-            self.collective_rpc("vnpu_unlock_gpu")
+            if not is_waiting:
+                self.collective_rpc("vnpu_start_wait")
+                is_waiting = True
+            self.collective_rpc("vnpu_unlock_gpu", kwargs={"keep_wait": True})
             time.sleep(0.001)
 
+def vnpu_has_higher_priority_waiter(self) -> bool:
+    res = self.collective_rpc("vnpu_has_higher_priority_waiter")
+    return any(res)
+
 
 Executor.is_offloaded = is_offloaded
 Executor.offload_vram = offload_vram
 Executor.reload_vram = reload_vram
+Executor.vnpu_has_higher_priority_waiter = vnpu_has_higher_priority_waiter
@@ -470,7 +470,7 @@ class NPUWorker(WorkerBase):
         # save memory to host with lock
         self.offload_vram()
         succ, _ = self.try_reload_vram()
-        assert succ, "Failed to reload model weights after offloading."
+        # assert succ, "Failed to reload model weights after offloading."
 
     def offload_vram(self) -> None:
         allocator = CaMemAllocator.get_instance()
@@ -480,9 +480,21 @@ class NPUWorker(WorkerBase):
         allocator = CaMemAllocator.get_instance()
         return allocator.try_reload_vram(tags=None)
 
-    def vnpu_unlock_gpu(self) -> None:
+    def vnpu_unlock_gpu(self, keep_wait: bool = False) -> None:
         allocator = CaMemAllocator.get_instance()
-        allocator.vnpu_unlock_gpu()
+        allocator.vnpu_unlock_gpu(keep_wait)
+
+    def vnpu_start_wait(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_start_wait()
+
+    def vnpu_cancel_wait(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_cancel_wait()
+
+    def vnpu_has_higher_priority_waiter(self) -> bool:
+        allocator = CaMemAllocator.get_instance()
+        return allocator.vnpu_has_higher_priority_waiter()
 
     def compile_or_warm_up_model(self) -> float:
         # Note: need to adapt for graph mode.
|||||||