178 lines
5.2 KiB
C++
178 lines
5.2 KiB
C++
|
|
#include <iostream>
|
||
|
|
#include <sys/types.h>
|
||
|
|
|
||
|
|
#include <sys/mman.h>
|
||
|
|
#include <sys/stat.h>
|
||
|
|
#include <fcntl.h>
|
||
|
|
#include <unistd.h>
|
||
|
|
#include <string.h>
|
||
|
|
#include <vector>
|
||
|
|
#include <atomic>
|
||
|
|
#include <signal.h>
|
||
|
|
|
||
|
|
#include "xpu_helper.h"
|
||
|
|
#include "shm_manager.h"
|
||
|
|
#include "spdlog/spdlog.h"
|
||
|
|
|
||
|
|
#include "xpu/runtime.h"
|
||
|
|
#include "xpu/xpuml.h"
|
||
|
|
|
||
|
|
|
||
|
|
// Process-wide ShmManager instance. start_daemon() creates and destroys it;
// handle_signal() reads it to request that the busy loop stop. It is nullptr
// whenever no manager is alive.
static ShmManager *shm_manager = nullptr;
|
||
|
|
|
||
|
|
void handle_signal(int sig) {
|
||
|
|
if (shm_manager) {
|
||
|
|
shm_manager->stop_busy_loop();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Registers handle_signal() as the handler for SIGINT, SIGTERM and SIGHUP.
///
/// The handler is installed with an empty signal mask and sa_flags == 0
/// (so SA_RESTART is not requested). Return values of sigaction() are not
/// checked.
void install_signal_handlers() {
  struct sigaction action{};
  sigemptyset(&action.sa_mask);
  action.sa_flags = 0;
  action.sa_handler = handle_signal;

  // Every signal in this table gets the same handler configuration.
  const int handled_signals[] = {SIGINT, SIGTERM, SIGHUP};
  for (const int signo : handled_signals) {
    sigaction(signo, &action, nullptr);
  }
}
|
||
|
|
|
||
|
|
size_t get_reserved_vram_size() {
|
||
|
|
const char *env_p = std::getenv("VXPU_RESERVED_VRAM_SIZE_GB");
|
||
|
|
size_t reserved_vram_size = 8ul * 1024 * 1024 * 1024; // default 8GB
|
||
|
|
if (env_p) {
|
||
|
|
try {
|
||
|
|
size_t size_gb = std::stoul(env_p);
|
||
|
|
reserved_vram_size = size_gb * 1024 * 1024 * 1024;
|
||
|
|
} catch (const std::exception &e) {
|
||
|
|
spdlog::warn("Failed to parse VXPU_RESERVED_VRAM_SIZE_GB: {}, using "
|
||
|
|
"default 8GB",
|
||
|
|
e.what());
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return reserved_vram_size;
|
||
|
|
}
|
||
|
|
|
||
|
|
void start_daemon() {
|
||
|
|
int device_count = 0;
|
||
|
|
int ret = xpu_device_count(&device_count);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpu_device_count failed with error code: " + std::to_string(ret) +
|
||
|
|
" " + __FILE__ + ":" + std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
|
||
|
|
std::vector<void *> dev_ptrs;
|
||
|
|
// shm
|
||
|
|
shm_manager = new ShmManager();
|
||
|
|
|
||
|
|
size_t reserved_vram_size = get_reserved_vram_size();
|
||
|
|
spdlog::info("Reserved gpu memory size per device: {:.1f} GB",
|
||
|
|
reserved_vram_size / (1024.0 * 1024 * 1024));
|
||
|
|
|
||
|
|
xpumlInit();
|
||
|
|
for (int i = 0; i < device_count; ++i) {
|
||
|
|
int ret = xpu_set_device(i);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpu_set_device failed with error code: " + std::to_string(ret) +
|
||
|
|
" " + __FILE__ + ":" + std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
uint64_t attr;
|
||
|
|
ret = xpu_device_get_attr(&attr, XPUATTR_PCI_ADDRESS, i);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpu_device_get_attr failed with error code: " + std::to_string(ret) +
|
||
|
|
" " + __FILE__ + ":" + std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
uint32_t pci_addr = static_cast<uint32_t>(attr);
|
||
|
|
spdlog::info("Setting up device id {} - {:04X}.{:02X}.{:02X}.{:X}", i,
|
||
|
|
((pci_addr >> 16) & 0xFFFF), ((pci_addr >> 8) & 0xFF),
|
||
|
|
((pci_addr >> 3) & 0x1F), (pci_addr & 0x7));
|
||
|
|
|
||
|
|
// get free memory size
|
||
|
|
xpumlDevice_t xpuml_device;
|
||
|
|
xpumlReturn_t ml_ret = xpumlDeviceGetHandleByIndex(i, &xpuml_device);
|
||
|
|
if (ml_ret != XPUML_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpumlDeviceGetHandleByIndex failed with error code: " +
|
||
|
|
std::to_string(ml_ret) + " " + __FILE__ + ":" +
|
||
|
|
std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
xpumlMemory_t mem_info;
|
||
|
|
ml_ret = xpumlDeviceGetMemoryInfo(xpuml_device, &mem_info);
|
||
|
|
if (ml_ret != XPUML_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpumlDeviceGetMemoryInfo failed with error code: " +
|
||
|
|
std::to_string(ml_ret) + " " + __FILE__ + ":" +
|
||
|
|
std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
size_t total_g_mem = mem_info.totalGlobalMemory;
|
||
|
|
size_t free_g_mem = mem_info.freeGlobalMemory;
|
||
|
|
size_t granularity = 2ul * 1024 * 1024; // 2MB
|
||
|
|
if (free_g_mem < reserved_vram_size) {
|
||
|
|
spdlog::error("Not enough free memory to reserve: {}, free_g_mem: {}",
|
||
|
|
reserved_vram_size, free_g_mem);
|
||
|
|
throw std::runtime_error("Not enough free memory to reserve");
|
||
|
|
}
|
||
|
|
size_t g_size =
|
||
|
|
(free_g_mem - reserved_vram_size) / granularity * granularity;
|
||
|
|
// allocate
|
||
|
|
void *dev_ptr = nullptr;
|
||
|
|
ret = xpu_malloc(&dev_ptr, g_size);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpu_malloc failed with error code: " + std::to_string(ret) + " " +
|
||
|
|
__FILE__ + ":" + std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
spdlog::info("device {} xpu_malloc succeeded, size: {}", i, g_size);
|
||
|
|
dev_ptrs.push_back(dev_ptr);
|
||
|
|
|
||
|
|
// get memhandle
|
||
|
|
XPUIpcMemHandle handle;
|
||
|
|
ret = xpu_ipc_get_memhandle(&handle, dev_ptr);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
throw std::runtime_error(
|
||
|
|
"xpu_ipc_get_memhandle failed with error code: " +
|
||
|
|
std::to_string(ret) + " " + __FILE__ + ":" +
|
||
|
|
std::to_string(__LINE__));
|
||
|
|
}
|
||
|
|
|
||
|
|
// shm set gpu info
|
||
|
|
shm_manager->set_xpu_info(i, pci_addr, g_size, handle);
|
||
|
|
}
|
||
|
|
xpumlShutdown();
|
||
|
|
|
||
|
|
// start busy loop
|
||
|
|
shm_manager->run_busy_loop();
|
||
|
|
|
||
|
|
// stopped by signal
|
||
|
|
delete shm_manager;
|
||
|
|
shm_manager = nullptr;
|
||
|
|
|
||
|
|
// free physical memory
|
||
|
|
for (int i = 0; i < device_count; ++i) {
|
||
|
|
int ret = xpu_set_device(i);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
spdlog::error("xpu_set_device failed during cleanup, error code: {}",
|
||
|
|
ret);
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
ret = xpu_free(dev_ptrs[i]);
|
||
|
|
if (ret != XPU_SUCCESS) {
|
||
|
|
spdlog::error("xpu_free failed during cleanup, error code: {}", ret);
|
||
|
|
} else {
|
||
|
|
spdlog::info("device {} xpu_free succeeded during cleanup", i);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
dev_ptrs.clear();
|
||
|
|
}
|
||
|
|
|
||
|
|
int main() {
|
||
|
|
install_signal_handlers();
|
||
|
|
|
||
|
|
start_daemon();
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|