Partially support multiple NPUs

This commit is contained in:
starkwj
2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions

View File

@@ -13,6 +13,7 @@
#include "acl/acl.h"
#include "shm_manager.h"
#include "npu_helper.h"
#include "spdlog/spdlog.h"
@@ -49,8 +50,6 @@ void ensure_context(unsigned long long device) {
void init_acl() {
int32_t deviceId=0;
// aclrtStream stream;
bool g_isDevice;
aclError ret = aclrtSetDevice(deviceId);
if (ret != ACL_ERROR_NONE) {
@@ -59,7 +58,8 @@ void init_acl() {
}
}
void reset_pids(const std::vector<int32_t> &pids, uint64_t shareable_handle) {
void reset_pids(const std::vector<int32_t> &pids,
const std::vector<uint64_t> &shareable_handles) {
int cnt = pids.size();
if (cnt <= 0) {
return;
@@ -68,21 +68,21 @@ void reset_pids(const std::vector<int32_t> &pids, uint64_t shareable_handle) {
int32_t pids_data[cnt];
memcpy(pids_data, pids.data(), cnt * sizeof(int32_t));
aclError error_code =
aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt);
if (error_code != 0) {
spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemSetPidToShareableHandle failed");
} else {
spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}",
cnt);
for (int i = 0; i < shareable_handles.size(); ++i) {
uint64_t shareable_handle = shareable_handles[i];
aclError error_code =
aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt);
if (error_code != 0) {
spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemSetPidToShareableHandle failed");
}
}
spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}", cnt);
}
void start_daemon() {
init_acl();
void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
size_t &out_g_size) {
aclError error_code;
size_t free_mem = 0, total = 0;
error_code = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total);
@@ -94,12 +94,11 @@ void start_daemon() {
total);
}
uint32_t device = 0;
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.id = device_id;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
@@ -107,7 +106,8 @@ void start_daemon() {
error_code = aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, &granularity);
if (error_code != 0) {
spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}", error_code);
spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
} else {
spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
@@ -118,59 +118,68 @@ void start_daemon() {
reserved_mem_size, free_mem);
throw std::runtime_error("Not enough free memory to reserve");
}
size_t g_size = free_mem - reserved_mem_size;
g_size = (g_size / granularity) * granularity;
out_g_size = free_mem - reserved_mem_size;
out_g_size = (out_g_size / granularity) * granularity;
// allocate physical memory
aclrtDrvMemHandle mem_handle;
error_code = aclrtMallocPhysical(&mem_handle, g_size, &prop, 0);
error_code = aclrtMallocPhysical(&out_mem_handle, out_g_size, &prop, 0);
if (error_code != 0) {
spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtMallocPhysical failed");
} else {
spdlog::info("aclrtMallocPhysical succeeded, size: {}", g_size);
spdlog::info("device {} aclrtMallocPhysical succeeded, size: {}", device_id,
out_g_size);
}
}
// // reserve address
// void *vmem_addr = nullptr;
// error_code = aclrtReserveMemAddress(&vmem_addr, g_size, 0, nullptr, 0);
// if (error_code != 0) {
// spdlog::error("aclrtReserveMemAddress failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtReserveMemAddress failed");
// } else {
// spdlog::info("aclrtReserveMemAddress succeeded, vmem_addr: {}", vmem_addr);
// }
// // map
// error_code = aclrtMapMem(vmem_addr, g_size, 0, mem_handle, 0);
// if (error_code != 0) {
// spdlog::error("aclrtMapMem failed, error_code: {}", error_code);
// throw std::runtime_error("aclrtMapMem failed");
// } else {
// spdlog::info("aclrtMapMem succeeded, vmem_addr: {}", vmem_addr);
// }
// export
uint64_t shareable_handle;
error_code = aclrtMemExportToShareableHandle(
mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT,
&shareable_handle);
if (error_code != 0) {
spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemExportToShareableHandle failed");
} else {
spdlog::info(
"aclrtMemExportToShareableHandle succeeded, shareable_handle: {}",
shareable_handle);
}
void start_daemon() {
init_acl();
std::vector<int> npu_ids = get_npu_ids();
std::vector<aclrtDrvMemHandle> mem_handles;
std::vector<uint64_t> shareable_handles;
// shm
shm_manager = new ShmManager();
shm_manager->set_gpu_info(g_size, shareable_handle);
for (int i = 0; i < npu_ids.size(); ++i) {
uint32_t device_id = i;
int npu_id = npu_ids[i];
spdlog::info("Setting up device id {} - npu id {}", device_id, npu_id);
aclError error_code = aclrtSetDevice(device_id);
if (error_code != ACL_ERROR_NONE) {
throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
std::to_string(error_code) + " " + __FILE__ +
":" + std::to_string(__LINE__));
}
// alloc physical
aclrtDrvMemHandle mem_handle;
size_t g_size;
alloc_physical(device_id, mem_handle, g_size);
mem_handles.push_back(mem_handle);
// export
uint64_t shareable_handle;
error_code = aclrtMemExportToShareableHandle(
mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT,
&shareable_handle);
if (error_code != 0) {
spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}",
error_code);
throw std::runtime_error("aclrtMemExportToShareableHandle failed");
} else {
spdlog::info(
"aclrtMemExportToShareableHandle succeeded, shareable_handle: {}",
shareable_handle);
}
shm_manager->set_gpu_info(npu_id, g_size, shareable_handle);
shareable_handles.push_back(shareable_handle);
}
shm_manager->register_callback_on_worker_change(
[&](const std::vector<int32_t> &pids) {
reset_pids(pids, shareable_handle);
reset_pids(pids, shareable_handles);
});
// start busy loop
@@ -181,10 +190,12 @@ void start_daemon() {
shm_manager = nullptr;
// free physical memory
error_code = aclrtFreePhysical(mem_handle);
if (error_code != 0) {
spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtFreePhysical failed");
for (auto mem_handle : mem_handles) {
aclError error_code = aclrtFreePhysical(mem_handle);
if (error_code != 0) {
spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code);
throw std::runtime_error("aclrtFreePhysical failed");
}
}
}