// Files
// xc-llm-kunlun/vllm_kunlun/csrc/vxpu_offload/vxpu_daemon.cpp
// 2026-02-12 10:46:37 +08:00
//
// 178 lines
// 5.2 KiB
// C++

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include "xpu_helper.h"
#include "shm_manager.h"
#include "spdlog/spdlog.h"
#include "xpu/runtime.h"
#include "xpu/xpuml.h"
// File-local pointer to the daemon's ShmManager. start_daemon() allocates it,
// and deletes it and resets the pointer to nullptr once the busy loop has
// returned; handle_signal() reads it to call stop_busy_loop().
// NOTE(review): this is a plain pointer that is also read from a signal
// handler — confirm that is acceptable for the intended platforms.
static ShmManager *shm_manager = nullptr;
// Signal handler shared by every signal registered in
// install_signal_handlers(). When a ShmManager currently exists it is asked to
// stop its busy loop; otherwise the signal is ignored.
//
// sig: number of the delivered signal (not used).
void handle_signal(int sig) {
  (void)sig;
  if (shm_manager == nullptr) {
    return;
  }
  shm_manager->stop_busy_loop();
}
// Registers handle_signal as the handler for SIGINT, SIGTERM and SIGHUP, with
// an empty blocked-signal mask and no sa_flags set.
void install_signal_handlers() {
  struct sigaction action{};
  sigemptyset(&action.sa_mask);
  action.sa_flags = 0;
  action.sa_handler = handle_signal;

  const int handled_signals[] = {SIGINT, SIGTERM, SIGHUP};
  for (const int signo : handled_signals) {
    sigaction(signo, &action, nullptr);
  }
}
// Returns the number of bytes of device memory to leave unallocated on each
// device.
//
// The value is read from the VXPU_RESERVED_VRAM_SIZE_GB environment variable,
// interpreted as a non-negative whole number of GiB. When the variable is
// unset the default of 8 GiB is returned. When it is set but unusable — not a
// number, negative, followed by non-whitespace characters, or so large that
// the byte count would overflow size_t — a warning is logged and the default
// of 8 GiB is returned.
size_t get_reserved_vram_size() {
  constexpr size_t kBytesPerGb = size_t{1024} * 1024 * 1024;
  constexpr size_t kDefaultReservedBytes = 8 * kBytesPerGb;

  const char *env_p = std::getenv("VXPU_RESERVED_VRAM_SIZE_GB");
  if (env_p == nullptr) {
    return kDefaultReservedBytes;
  }

  try {
    const std::string text(env_p);
    // std::stoul accepts a leading '-' and returns the wrapped-around unsigned
    // value, so a minus sign has to be rejected explicitly.
    if (text.find('-') != std::string::npos) {
      throw std::invalid_argument("negative value is not allowed");
    }
    size_t consumed = 0;
    const unsigned long size_gb = std::stoul(text, &consumed);
    // std::stoul stops at the first non-digit; anything other than trailing
    // whitespace means the value was not a plain number (e.g. "8abc").
    if (text.find_first_not_of(" \t\r\n", consumed) != std::string::npos) {
      throw std::invalid_argument("unexpected characters after the number");
    }
    // Guard the GiB -> bytes multiplication against wrapping around.
    if (size_gb > std::numeric_limits<size_t>::max() / kBytesPerGb) {
      throw std::out_of_range("value is too large");
    }
    return static_cast<size_t>(size_gb) * kBytesPerGb;
  } catch (const std::exception &e) {
    spdlog::warn("Failed to parse VXPU_RESERVED_VRAM_SIZE_GB: {}, using "
                 "default 8GB",
                 e.what());
  }
  return kDefaultReservedBytes;
}
void start_daemon() {
int device_count = 0;
int ret = xpu_device_count(&device_count);
if (ret != XPU_SUCCESS) {
throw std::runtime_error(
"xpu_device_count failed with error code: " + std::to_string(ret) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
std::vector<void *> dev_ptrs;
// shm
shm_manager = new ShmManager();
size_t reserved_vram_size = get_reserved_vram_size();
spdlog::info("Reserved gpu memory size per device: {:.1f} GB",
reserved_vram_size / (1024.0 * 1024 * 1024));
xpumlInit();
for (int i = 0; i < device_count; ++i) {
int ret = xpu_set_device(i);
if (ret != XPU_SUCCESS) {
throw std::runtime_error(
"xpu_set_device failed with error code: " + std::to_string(ret) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
uint64_t attr;
ret = xpu_device_get_attr(&attr, XPUATTR_PCI_ADDRESS, i);
if (ret != XPU_SUCCESS) {
throw std::runtime_error(
"xpu_device_get_attr failed with error code: " + std::to_string(ret) +
" " + __FILE__ + ":" + std::to_string(__LINE__));
}
uint32_t pci_addr = static_cast<uint32_t>(attr);
spdlog::info("Setting up device id {} - {:04X}.{:02X}.{:02X}.{:X}", i,
((pci_addr >> 16) & 0xFFFF), ((pci_addr >> 8) & 0xFF),
((pci_addr >> 3) & 0x1F), (pci_addr & 0x7));
// get free memory size
xpumlDevice_t xpuml_device;
xpumlReturn_t ml_ret = xpumlDeviceGetHandleByIndex(i, &xpuml_device);
if (ml_ret != XPUML_SUCCESS) {
throw std::runtime_error(
"xpumlDeviceGetHandleByIndex failed with error code: " +
std::to_string(ml_ret) + " " + __FILE__ + ":" +
std::to_string(__LINE__));
}
xpumlMemory_t mem_info;
ml_ret = xpumlDeviceGetMemoryInfo(xpuml_device, &mem_info);
if (ml_ret != XPUML_SUCCESS) {
throw std::runtime_error(
"xpumlDeviceGetMemoryInfo failed with error code: " +
std::to_string(ml_ret) + " " + __FILE__ + ":" +
std::to_string(__LINE__));
}
size_t total_g_mem = mem_info.totalGlobalMemory;
size_t free_g_mem = mem_info.freeGlobalMemory;
size_t granularity = 2ul * 1024 * 1024; // 2MB
if (free_g_mem < reserved_vram_size) {
spdlog::error("Not enough free memory to reserve: {}, free_g_mem: {}",
reserved_vram_size, free_g_mem);
throw std::runtime_error("Not enough free memory to reserve");
}
size_t g_size =
(free_g_mem - reserved_vram_size) / granularity * granularity;
// allocate
void *dev_ptr = nullptr;
ret = xpu_malloc(&dev_ptr, g_size);
if (ret != XPU_SUCCESS) {
throw std::runtime_error(
"xpu_malloc failed with error code: " + std::to_string(ret) + " " +
__FILE__ + ":" + std::to_string(__LINE__));
}
spdlog::info("device {} xpu_malloc succeeded, size: {}", i, g_size);
dev_ptrs.push_back(dev_ptr);
// get memhandle
XPUIpcMemHandle handle;
ret = xpu_ipc_get_memhandle(&handle, dev_ptr);
if (ret != XPU_SUCCESS) {
throw std::runtime_error(
"xpu_ipc_get_memhandle failed with error code: " +
std::to_string(ret) + " " + __FILE__ + ":" +
std::to_string(__LINE__));
}
// shm set gpu info
shm_manager->set_xpu_info(i, pci_addr, g_size, handle);
}
xpumlShutdown();
// start busy loop
shm_manager->run_busy_loop();
// stopped by signal
delete shm_manager;
shm_manager = nullptr;
// free physical memory
for (int i = 0; i < device_count; ++i) {
int ret = xpu_set_device(i);
if (ret != XPU_SUCCESS) {
spdlog::error("xpu_set_device failed during cleanup, error code: {}",
ret);
continue;
}
ret = xpu_free(dev_ptrs[i]);
if (ret != XPU_SUCCESS) {
spdlog::error("xpu_free failed during cleanup, error code: {}", ret);
} else {
spdlog::info("device {} xpu_free succeeded during cleanup", i);
}
}
dev_ptrs.clear();
}
int main() {
install_signal_handlers();
start_daemon();
return 0;
}