support multi npu partially

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -1,7 +1,6 @@
#include "shm_manager.h"
#include <algorithm>
// === ShmManager ===
ShmManager::ShmManager() {
std::string shm_name = get_shm_name();
@@ -37,11 +36,12 @@ ShmManager::~ShmManager() {
shm_unlink(shm_name.c_str());
}
void ShmManager::set_gpu_info(uint64_t vmem_size, uint64_t shared_handle) {
shm_helper->set_gpu_info(vmem_size, shared_handle);
// Publishes one GPU's info through shm_helper and remembers its id.
//
// gpu_id        - id of the device; stored in valid_gpu_ids, which
//                 check_heart_beats() walks to index gpu_flag[gpu_id].
// vmem_size     - forwarded unchanged to shm_helper->set_gpu_info().
// shared_handle - forwarded unchanged to shm_helper->set_gpu_info().
void ShmManager::set_gpu_info(int gpu_id, uint64_t vmem_size,
                              uint64_t shared_handle) {
  shm_helper->set_gpu_info(gpu_id, vmem_size, shared_handle);
  // Record each id only once: a repeated call for the same GPU must not
  // add a duplicate entry, otherwise every pass over valid_gpu_ids would
  // re-check that GPU's lock flag once per duplicate.
  if (std::find(this->valid_gpu_ids.begin(), this->valid_gpu_ids.end(),
                gpu_id) == this->valid_gpu_ids.end()) {
    this->valid_gpu_ids.push_back(gpu_id);
  }
}
void ShmManager::run_busy_loop() {
if (!cb_on_worker_change) {
spdlog::error("cb_on_worker_change is not set");
@@ -155,14 +155,17 @@ void ShmManager::check_heart_beats() {
shm_helper->heart_beats[i].timestamp.store(0,
std::memory_order_release);
// check dead lock
uint64_t gpu_flag =
shm_helper->gpu_flag.load(std::memory_order_acquire);
if (unpack_lock_field(gpu_flag) == 1 &&
unpack_tgid_field(gpu_flag) == tgid) {
// release lock held by dead worker
spdlog::warn("Releasing GPU lock held by dead worker TGID {}", tgid);
shm_helper->gpu_flag.store(pack_unlocked_tgid(tgid),
std::memory_order_release);
for (int gpu_id : valid_gpu_ids) {
uint64_t gpu_flag =
shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
if (unpack_lock_field(gpu_flag) == 1 &&
unpack_tgid_field(gpu_flag) == tgid) {
// release lock held by dead worker
spdlog::warn("Releasing GPU {} lock held by dead worker TGID {}",
gpu_id, tgid);
shm_helper->gpu_flag[gpu_id].store(pack_unlocked_tgid(tgid),
std::memory_order_release);
}
}
local_worker_tgids[i] = 0;
alive_worker_tgids.erase(std::remove(alive_worker_tgids.begin(),