2025-12-26 07:37:35 +00:00
|
|
|
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <limits>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

#include "acl/acl.h"
#include "shm_manager.h"
#include "npu_helper.h"
#include "spdlog/spdlog.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// File-local pointer to the daemon's ShmManager. start_daemon() assigns it
// and resets it to nullptr on shutdown; handle_signal() reads it to stop the
// busy loop. It is nullptr whenever no manager exists.
static ShmManager *shm_manager = nullptr;
|
|
|
|
|
|
|
|
|
|
void handle_signal(int sig) {
|
|
|
|
|
if (shm_manager) {
|
|
|
|
|
shm_manager->stop_busy_loop();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Registers handle_signal() for SIGINT, SIGTERM and SIGHUP.
// The handler is installed with an empty signal mask and no flags set
// (in particular, SA_RESTART is not requested).
void install_signal_handlers() {
  struct sigaction action{};
  action.sa_flags = 0;
  action.sa_handler = handle_signal;
  sigemptyset(&action.sa_mask);

  const int handled_signals[] = {SIGINT, SIGTERM, SIGHUP};
  for (const int signo : handled_signals) {
    sigaction(signo, &action, nullptr);
  }
}
|
|
|
|
|
|
|
|
|
|
size_t get_reserved_vram_size() {
|
|
|
|
|
static std::once_flag flag;
|
|
|
|
|
static size_t reserved_vram_size = 8ul * 1024 * 1024 * 1024; // default 8GB
|
|
|
|
|
|
|
|
|
|
std::call_once(flag, []() {
|
|
|
|
|
const char *env_p = std::getenv("VNPU_RESERVED_VRAM_SIZE_GB");
|
|
|
|
|
if (env_p) {
|
|
|
|
|
try {
|
|
|
|
|
size_t size_gb = std::stoul(env_p);
|
|
|
|
|
reserved_vram_size = size_gb * 1024 * 1024 * 1024;
|
|
|
|
|
} catch (const std::exception &e) {
|
|
|
|
|
spdlog::warn("Failed to parse VNPU_RESERVED_VRAM_SIZE_GB: {}, using "
|
|
|
|
|
"default 8GB",
|
|
|
|
|
e.what());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
return reserved_vram_size;
|
2025-12-26 07:37:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void ensure_context(unsigned long long device) {
|
|
|
|
|
aclrtContext pctx;
|
|
|
|
|
aclrtGetCurrentContext(&pctx);
|
|
|
|
|
if (!pctx) {
|
|
|
|
|
// Ensure device context.
|
|
|
|
|
aclrtCreateContext(&pctx, device);
|
|
|
|
|
aclrtSetCurrentContext(pctx);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void init_acl() {
|
|
|
|
|
int32_t deviceId=0;
|
|
|
|
|
|
|
|
|
|
aclError ret = aclrtSetDevice(deviceId);
|
|
|
|
|
if (ret != ACL_ERROR_NONE) {
|
|
|
|
|
throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
|
|
|
|
|
std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-01-08 06:54:33 +00:00
|
|
|
void reset_pids(const std::vector<int32_t> &pids,
|
|
|
|
|
const std::vector<uint64_t> &shareable_handles) {
|
2025-12-26 07:37:35 +00:00
|
|
|
int cnt = pids.size();
|
|
|
|
|
if (cnt <= 0) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t pids_data[cnt];
|
|
|
|
|
memcpy(pids_data, pids.data(), cnt * sizeof(int32_t));
|
|
|
|
|
|
2026-01-08 06:54:33 +00:00
|
|
|
for (int i = 0; i < shareable_handles.size(); ++i) {
|
|
|
|
|
uint64_t shareable_handle = shareable_handles[i];
|
|
|
|
|
aclError error_code =
|
|
|
|
|
aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt);
|
|
|
|
|
if (error_code != 0) {
|
|
|
|
|
spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}",
|
|
|
|
|
error_code);
|
|
|
|
|
throw std::runtime_error("aclrtMemSetPidToShareableHandle failed");
|
|
|
|
|
}
|
2025-12-26 07:37:35 +00:00
|
|
|
}
|
2026-01-08 06:54:33 +00:00
|
|
|
spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}", cnt);
|
2025-12-26 07:37:35 +00:00
|
|
|
}
|
|
|
|
|
|
2026-01-08 06:54:33 +00:00
|
|
|
void alloc_physical(uint32_t device_id, aclrtDrvMemHandle &out_mem_handle,
|
|
|
|
|
size_t &out_g_size) {
|
2025-12-26 07:37:35 +00:00
|
|
|
aclError error_code;
|
|
|
|
|
size_t free_mem = 0, total = 0;
|
|
|
|
|
error_code = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total);
|
|
|
|
|
if (error_code != 0) {
|
|
|
|
|
spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code);
|
|
|
|
|
throw std::runtime_error("aclrtGetMemInfo failed");
|
|
|
|
|
} else {
|
|
|
|
|
spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem,
|
|
|
|
|
total);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
aclrtPhysicalMemProp prop = {};
|
|
|
|
|
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
|
|
|
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
|
|
|
prop.memAttr = ACL_HBM_MEM_HUGE;
|
2026-01-08 06:54:33 +00:00
|
|
|
prop.location.id = device_id;
|
2025-12-26 07:37:35 +00:00
|
|
|
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
|
|
|
prop.reserve = 0;
|
|
|
|
|
|
|
|
|
|
size_t granularity;
|
|
|
|
|
error_code = aclrtMemGetAllocationGranularity(
|
|
|
|
|
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, &granularity);
|
|
|
|
|
if (error_code != 0) {
|
2026-01-08 06:54:33 +00:00
|
|
|
spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}",
|
|
|
|
|
error_code);
|
2025-12-26 07:37:35 +00:00
|
|
|
throw std::runtime_error("aclrtMemGetAllocationGranularity failed");
|
|
|
|
|
} else {
|
|
|
|
|
spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}",
|
|
|
|
|
granularity);
|
|
|
|
|
}
|
2026-02-11 06:27:58 +00:00
|
|
|
size_t reserved_mem_size = get_reserved_vram_size();
|
2025-12-26 07:37:35 +00:00
|
|
|
if (free_mem < reserved_mem_size) {
|
|
|
|
|
spdlog::error("Not enough free memory to reserve: {}, free_mem: {}",
|
|
|
|
|
reserved_mem_size, free_mem);
|
|
|
|
|
throw std::runtime_error("Not enough free memory to reserve");
|
|
|
|
|
}
|
2026-01-08 06:54:33 +00:00
|
|
|
out_g_size = free_mem - reserved_mem_size;
|
|
|
|
|
out_g_size = (out_g_size / granularity) * granularity;
|
2025-12-26 07:37:35 +00:00
|
|
|
|
|
|
|
|
// allocate physical memory
|
2026-01-08 06:54:33 +00:00
|
|
|
error_code = aclrtMallocPhysical(&out_mem_handle, out_g_size, &prop, 0);
|
2025-12-26 07:37:35 +00:00
|
|
|
if (error_code != 0) {
|
|
|
|
|
spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code);
|
|
|
|
|
throw std::runtime_error("aclrtMallocPhysical failed");
|
|
|
|
|
} else {
|
2026-01-08 06:54:33 +00:00
|
|
|
spdlog::info("device {} aclrtMallocPhysical succeeded, size: {}", device_id,
|
|
|
|
|
out_g_size);
|
2025-12-26 07:37:35 +00:00
|
|
|
}
|
2026-01-08 06:54:33 +00:00
|
|
|
}
|
2025-12-26 07:37:35 +00:00
|
|
|
|
2026-01-08 06:54:33 +00:00
|
|
|
void start_daemon() {
|
|
|
|
|
init_acl();
|
2025-12-26 07:37:35 +00:00
|
|
|
|
2026-01-08 06:54:33 +00:00
|
|
|
std::vector<int> npu_ids = get_npu_ids();
|
|
|
|
|
std::vector<aclrtDrvMemHandle> mem_handles;
|
|
|
|
|
std::vector<uint64_t> shareable_handles;
|
2025-12-26 07:37:35 +00:00
|
|
|
// shm
|
|
|
|
|
shm_manager = new ShmManager();
|
2026-01-08 06:54:33 +00:00
|
|
|
|
|
|
|
|
for (int i = 0; i < npu_ids.size(); ++i) {
|
|
|
|
|
uint32_t device_id = i;
|
|
|
|
|
int npu_id = npu_ids[i];
|
|
|
|
|
spdlog::info("Setting up device id {} - npu id {}", device_id, npu_id);
|
|
|
|
|
aclError error_code = aclrtSetDevice(device_id);
|
|
|
|
|
if (error_code != ACL_ERROR_NONE) {
|
|
|
|
|
throw std::runtime_error("aclrtSetDevice failed with acl error code: " +
|
|
|
|
|
std::to_string(error_code) + " " + __FILE__ +
|
|
|
|
|
":" + std::to_string(__LINE__));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// alloc physical
|
|
|
|
|
aclrtDrvMemHandle mem_handle;
|
|
|
|
|
size_t g_size;
|
|
|
|
|
alloc_physical(device_id, mem_handle, g_size);
|
|
|
|
|
mem_handles.push_back(mem_handle);
|
|
|
|
|
|
|
|
|
|
// export
|
|
|
|
|
uint64_t shareable_handle;
|
|
|
|
|
error_code = aclrtMemExportToShareableHandle(
|
|
|
|
|
mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT,
|
|
|
|
|
&shareable_handle);
|
|
|
|
|
if (error_code != 0) {
|
|
|
|
|
spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}",
|
|
|
|
|
error_code);
|
|
|
|
|
throw std::runtime_error("aclrtMemExportToShareableHandle failed");
|
|
|
|
|
} else {
|
|
|
|
|
spdlog::info(
|
|
|
|
|
"aclrtMemExportToShareableHandle succeeded, shareable_handle: {}",
|
|
|
|
|
shareable_handle);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
shm_manager->set_gpu_info(npu_id, g_size, shareable_handle);
|
|
|
|
|
shareable_handles.push_back(shareable_handle);
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-26 07:37:35 +00:00
|
|
|
shm_manager->register_callback_on_worker_change(
|
|
|
|
|
[&](const std::vector<int32_t> &pids) {
|
2026-01-08 06:54:33 +00:00
|
|
|
reset_pids(pids, shareable_handles);
|
2025-12-26 07:37:35 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
|
|
// start busy loop
|
|
|
|
|
shm_manager->run_busy_loop();
|
|
|
|
|
|
|
|
|
|
// stopped by signal
|
|
|
|
|
delete shm_manager;
|
|
|
|
|
shm_manager = nullptr;
|
|
|
|
|
|
|
|
|
|
// free physical memory
|
2026-01-08 06:54:33 +00:00
|
|
|
for (auto mem_handle : mem_handles) {
|
|
|
|
|
aclError error_code = aclrtFreePhysical(mem_handle);
|
|
|
|
|
if (error_code != 0) {
|
|
|
|
|
spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code);
|
|
|
|
|
throw std::runtime_error("aclrtFreePhysical failed");
|
|
|
|
|
}
|
2025-12-26 07:37:35 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int main() {
|
|
|
|
|
install_signal_handlers();
|
|
|
|
|
|
|
|
|
|
start_daemon();
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|