support multi npu partially
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
#include "shm_worker.h"
|
||||
|
||||
// === ShmWorker ===
|
||||
|
||||
ShmWorker::ShmWorker() {
|
||||
std::string shm_name = get_shm_name();
|
||||
@@ -29,16 +28,22 @@ ShmWorker::~ShmWorker() {
|
||||
munmap(shm_helper, SHM_SIZE);
|
||||
}
|
||||
|
||||
bool ShmWorker::register_worker(int32_t tgid, uint64_t *out_shareable_handle,
|
||||
bool ShmWorker::register_worker(int32_t tgid, int gpu_id,
|
||||
uint64_t *out_shareable_handle,
|
||||
uint64_t *out_vmem_size) {
|
||||
if (gpu_id < 0 || gpu_id >= MAX_DEVICES) {
|
||||
spdlog::error("Invalid GPU ID {}", gpu_id);
|
||||
throw std::runtime_error("Invalid GPU ID");
|
||||
}
|
||||
this->tgid = tgid;
|
||||
this->gpu_id = gpu_id;
|
||||
int slot = register_worker_shm();
|
||||
if (slot == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*out_shareable_handle = shm_helper->shareable_handle;
|
||||
*out_vmem_size = shm_helper->total_vmem_size;
|
||||
*out_shareable_handle = shm_helper->vram_info[gpu_id].shareable_handle;
|
||||
*out_vmem_size = shm_helper->vram_info[gpu_id].total_vmem_size;
|
||||
|
||||
stop_heart_beat.store(false, std::memory_order_release);
|
||||
heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this, slot);
|
||||
@@ -68,47 +73,62 @@ void ShmWorker::heart_beat_loop(int slot) {
|
||||
}
|
||||
}
|
||||
|
||||
bool ShmWorker::lock_gpu() {
|
||||
int retry_cnt = 0;
|
||||
uint64_t old_flag = shm_helper->gpu_flag.load(std::memory_order_acquire);
|
||||
bool ShmWorker::try_lock_gpu(bool &out_self_hold) {
|
||||
static int retry_cnt = 0;
|
||||
|
||||
uint64_t old_flag =
|
||||
shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
|
||||
if (unpack_lock_field(old_flag) == 0) { // free
|
||||
uint64_t new_flag = pack_locked_tgid(tgid);
|
||||
if (shm_helper->gpu_flag[gpu_id].compare_exchange_weak(
|
||||
old_flag, new_flag, std::memory_order_acq_rel,
|
||||
std::memory_order_acquire)) {
|
||||
spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
|
||||
int32_t prev_tgid = unpack_tgid_field(old_flag);
|
||||
out_self_hold = prev_tgid == tgid;
|
||||
retry_cnt = 0;
|
||||
return true;
|
||||
}
|
||||
} else { // locked
|
||||
if (unpack_tgid_field(old_flag) == tgid) {
|
||||
spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
|
||||
out_self_hold = true;
|
||||
retry_cnt = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// failed
|
||||
if (++retry_cnt % 2000 == 0) {
|
||||
spdlog::info(
|
||||
"TGID {} trying to acquire GPU {} lock, current lock holder TGID {}",
|
||||
tgid, gpu_id, unpack_tgid_field(old_flag));
|
||||
}
|
||||
out_self_hold = false;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ShmWorker::lock_gpu(bool &out_self_hold) {
|
||||
while (true) {
|
||||
if (unpack_lock_field(old_flag) == 0) {
|
||||
uint64_t new_flag = pack_locked_tgid(tgid);
|
||||
if (shm_helper->gpu_flag.compare_exchange_weak(old_flag, new_flag,
|
||||
std::memory_order_acq_rel,
|
||||
std::memory_order_acquire)) {
|
||||
spdlog::info("TGID {} acquired GPU lock", tgid);
|
||||
int32_t old_tgid = unpack_tgid_field(old_flag);
|
||||
return old_tgid == tgid;
|
||||
}
|
||||
} else {
|
||||
if (unpack_tgid_field(old_flag) == tgid) {
|
||||
spdlog::info("TGID {} already holds the GPU lock", tgid);
|
||||
return true;
|
||||
}
|
||||
if (try_lock_gpu(out_self_hold)) {
|
||||
return true;
|
||||
}
|
||||
// failed
|
||||
++retry_cnt;
|
||||
if (retry_cnt % 1000 == 0) {
|
||||
spdlog::info(
|
||||
"TGID {} waiting for GPU lock, current lock holder TGID {}", tgid,
|
||||
unpack_tgid_field(old_flag));
|
||||
}
|
||||
usleep(1000);
|
||||
old_flag = shm_helper->gpu_flag.load(std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
|
||||
void ShmWorker::unlock_gpu() {
|
||||
uint64_t old_flag = shm_helper->gpu_flag.load(std::memory_order_acquire);
|
||||
uint64_t old_flag =
|
||||
shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
|
||||
if (unpack_tgid_field(old_flag) != tgid) {
|
||||
spdlog::warn("previous gpu flag {} does not match expected locked flag for "
|
||||
"TGID {}. This may be a bug, unless during startup.",
|
||||
old_flag, tgid);
|
||||
// spdlog::warn("previous gpu flag {} does not match expected locked flag for "
|
||||
// "TGID {}. This may be a bug, unless during startup.",
|
||||
// old_flag, tgid);
|
||||
spdlog::info("TGID {} does not hold GPU {} lock", tgid, gpu_id);
|
||||
} else {
|
||||
uint64_t new_flag = pack_unlocked_tgid(tgid);
|
||||
shm_helper->gpu_flag.store(new_flag, std::memory_order_release);
|
||||
spdlog::info("TGID {} released GPU lock", tgid);
|
||||
shm_helper->gpu_flag[gpu_id].store(new_flag, std::memory_order_release);
|
||||
spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -142,7 +162,7 @@ uint64_t ShmWorker::make_request(uint32_t type, uint64_t parameter) {
|
||||
uint64_t response = shm_helper->request.response;
|
||||
// set ready to 0
|
||||
shm_helper->req_ready.store(ShmHelper::READY_STATE_NO_REQUEST,
|
||||
std::memory_order_release);
|
||||
std::memory_order_release);
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user