Add partial support for multiple NPUs
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
#include "shm_manager.h"
|
||||
#include <algorithm>
|
||||
|
||||
// === ShmManager ===
|
||||
|
||||
ShmManager::ShmManager() {
|
||||
std::string shm_name = get_shm_name();
|
||||
@@ -37,11 +36,12 @@ ShmManager::~ShmManager() {
|
||||
shm_unlink(shm_name.c_str());
|
||||
}
|
||||
|
||||
void ShmManager::set_gpu_info(uint64_t vmem_size, uint64_t shared_handle) {
|
||||
shm_helper->set_gpu_info(vmem_size, shared_handle);
|
||||
/// Registers the memory info of one device and remembers its id.
///
/// Forwards `gpu_id`, `vmem_size` and `shared_handle` to
/// `shm_helper->set_gpu_info(...)`, then records `gpu_id` in `valid_gpu_ids`.
///
/// @param gpu_id        Id of the device being registered; also used elsewhere
///                      in this file as the index into `shm_helper->gpu_flag`.
/// @param vmem_size     Passed through unchanged to the shm helper.
/// @param shared_handle Passed through unchanged to the shm helper.
void ShmManager::set_gpu_info(int gpu_id, uint64_t vmem_size,
                              uint64_t shared_handle) {
  shm_helper->set_gpu_info(gpu_id, vmem_size, shared_handle);

  // Record each device id only once: calling this again for an already
  // registered device (e.g. to refresh its info) must not make the code that
  // iterates valid_gpu_ids visit the same per-device lock flag repeatedly.
  const bool already_registered =
      std::find(this->valid_gpu_ids.begin(), this->valid_gpu_ids.end(),
                gpu_id) != this->valid_gpu_ids.end();
  if (!already_registered) {
    this->valid_gpu_ids.push_back(gpu_id);
  }
}
|
||||
|
||||
|
||||
void ShmManager::run_busy_loop() {
|
||||
if (!cb_on_worker_change) {
|
||||
spdlog::error("cb_on_worker_change is not set");
|
||||
@@ -155,14 +155,17 @@ void ShmManager::check_heart_beats() {
|
||||
shm_helper->heart_beats[i].timestamp.store(0,
|
||||
std::memory_order_release);
|
||||
// check dead lock
|
||||
uint64_t gpu_flag =
|
||||
shm_helper->gpu_flag.load(std::memory_order_acquire);
|
||||
if (unpack_lock_field(gpu_flag) == 1 &&
|
||||
unpack_tgid_field(gpu_flag) == tgid) {
|
||||
// release lock held by dead worker
|
||||
spdlog::warn("Releasing GPU lock held by dead worker TGID {}", tgid);
|
||||
shm_helper->gpu_flag.store(pack_unlocked_tgid(tgid),
|
||||
std::memory_order_release);
|
||||
for (int gpu_id : valid_gpu_ids) {
|
||||
uint64_t gpu_flag =
|
||||
shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
|
||||
if (unpack_lock_field(gpu_flag) == 1 &&
|
||||
unpack_tgid_field(gpu_flag) == tgid) {
|
||||
// release lock held by dead worker
|
||||
spdlog::warn("Releasing GPU {} lock held by dead worker TGID {}",
|
||||
gpu_id, tgid);
|
||||
shm_helper->gpu_flag[gpu_id].store(pack_unlocked_tgid(tgid),
|
||||
std::memory_order_release);
|
||||
}
|
||||
}
|
||||
local_worker_tgids[i] = 0;
|
||||
alive_worker_tgids.erase(std::remove(alive_worker_tgids.begin(),
|
||||
|
||||
Reference in New Issue
Block a user