#include #include #include #include #include #include #include #include #include #include #include "xpu_helper.h" #include "shm_manager.h" #include "spdlog/spdlog.h" #include "xpu/runtime.h" #include "xpu/xpuml.h" static ShmManager *shm_manager = nullptr; void handle_signal(int sig) { if (shm_manager) { shm_manager->stop_busy_loop(); } } void install_signal_handlers() { struct sigaction sa{}; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGINT, &sa, nullptr); sigaction(SIGTERM, &sa, nullptr); sigaction(SIGHUP, &sa, nullptr); } size_t get_reserved_vram_size() { const char *env_p = std::getenv("VXPU_RESERVED_VRAM_SIZE_GB"); size_t reserved_vram_size = 8ul * 1024 * 1024 * 1024; // default 8GB if (env_p) { try { size_t size_gb = std::stoul(env_p); reserved_vram_size = size_gb * 1024 * 1024 * 1024; } catch (const std::exception &e) { spdlog::warn("Failed to parse VXPU_RESERVED_VRAM_SIZE_GB: {}, using " "default 8GB", e.what()); } } return reserved_vram_size; } void start_daemon() { int device_count = 0; int ret = xpu_device_count(&device_count); if (ret != XPU_SUCCESS) { throw std::runtime_error( "xpu_device_count failed with error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } std::vector dev_ptrs; // shm shm_manager = new ShmManager(); size_t reserved_vram_size = get_reserved_vram_size(); spdlog::info("Reserved gpu memory size per device: {:.1f} GB", reserved_vram_size / (1024.0 * 1024 * 1024)); xpumlInit(); for (int i = 0; i < device_count; ++i) { int ret = xpu_set_device(i); if (ret != XPU_SUCCESS) { throw std::runtime_error( "xpu_set_device failed with error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } uint64_t attr; ret = xpu_device_get_attr(&attr, XPUATTR_PCI_ADDRESS, i); if (ret != XPU_SUCCESS) { throw std::runtime_error( "xpu_device_get_attr failed with error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } uint32_t pci_addr = static_cast(attr); spdlog::info("Setting up device id {} - {:04X}.{:02X}.{:02X}.{:X}", i, ((pci_addr >> 16) & 0xFFFF), ((pci_addr >> 8) & 0xFF), ((pci_addr >> 3) & 0x1F), (pci_addr & 0x7)); // get free memory size xpumlDevice_t xpuml_device; xpumlReturn_t ml_ret = xpumlDeviceGetHandleByIndex(i, &xpuml_device); if (ml_ret != XPUML_SUCCESS) { throw std::runtime_error( "xpumlDeviceGetHandleByIndex failed with error code: " + std::to_string(ml_ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } xpumlMemory_t mem_info; ml_ret = xpumlDeviceGetMemoryInfo(xpuml_device, &mem_info); if (ml_ret != XPUML_SUCCESS) { throw std::runtime_error( "xpumlDeviceGetMemoryInfo failed with error code: " + std::to_string(ml_ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } size_t total_g_mem = mem_info.totalGlobalMemory; size_t free_g_mem = mem_info.freeGlobalMemory; size_t granularity = 2ul * 1024 * 1024; // 2MB if (free_g_mem < reserved_vram_size) { spdlog::error("Not enough free memory to reserve: {}, free_g_mem: {}", reserved_vram_size, free_g_mem); throw std::runtime_error("Not enough free memory to reserve"); } size_t g_size = (free_g_mem - reserved_vram_size) / granularity * granularity; // allocate void *dev_ptr = nullptr; ret = xpu_malloc(&dev_ptr, g_size); if (ret != XPU_SUCCESS) { throw std::runtime_error( "xpu_malloc failed with error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } spdlog::info("device {} xpu_malloc succeeded, size: {}", i, g_size); dev_ptrs.push_back(dev_ptr); // get memhandle XPUIpcMemHandle handle; ret = xpu_ipc_get_memhandle(&handle, dev_ptr); if (ret != XPU_SUCCESS) { throw std::runtime_error( "xpu_ipc_get_memhandle failed with error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } // shm set gpu info shm_manager->set_xpu_info(i, pci_addr, g_size, handle); } xpumlShutdown(); // start busy loop shm_manager->run_busy_loop(); // stopped by signal delete shm_manager; shm_manager = nullptr; // free physical memory for (int i = 0; i < device_count; ++i) { int ret = xpu_set_device(i); if (ret != XPU_SUCCESS) { spdlog::error("xpu_set_device failed during cleanup, error code: {}", ret); continue; } ret = xpu_free(dev_ptrs[i]); if (ret != XPU_SUCCESS) { spdlog::error("xpu_free failed during cleanup, error code: {}", ret); } else { spdlog::info("device {} xpu_free succeeded during cleanup", i); } } dev_ptrs.clear(); } int main() { install_signal_handlers(); start_daemon(); return 0; }