#include #include #include #include #include #include #include #include #include #include #include "acl/acl.h" #include "shm_manager.h" #include "spdlog/spdlog.h" static constexpr size_t reserved_mem_size = 8ul * 1024 * 1024 * 1024; // 8GB static ShmManager *shm_manager = nullptr; void handle_signal(int sig) { if (shm_manager) { shm_manager->stop_busy_loop(); } } void install_signal_handlers() { struct sigaction sa{}; sa.sa_handler = handle_signal; sigemptyset(&sa.sa_mask); sa.sa_flags = 0; sigaction(SIGINT, &sa, nullptr); sigaction(SIGTERM, &sa, nullptr); sigaction(SIGHUP, &sa, nullptr); } void ensure_context(unsigned long long device) { aclrtContext pctx; aclrtGetCurrentContext(&pctx); if (!pctx) { // Ensure device context. aclrtCreateContext(&pctx, device); aclrtSetCurrentContext(pctx); } } void init_acl() { int32_t deviceId=0; // aclrtStream stream; bool g_isDevice; aclError ret = aclrtSetDevice(deviceId); if (ret != ACL_ERROR_NONE) { throw std::runtime_error("aclrtSetDevice failed with acl error code: " + std::to_string(ret) + " " + __FILE__ + ":" + std::to_string(__LINE__)); } } void reset_pids(const std::vector &pids, uint64_t shareable_handle) { int cnt = pids.size(); if (cnt <= 0) { return; } int32_t pids_data[cnt]; memcpy(pids_data, pids.data(), cnt * sizeof(int32_t)); aclError error_code = aclrtMemSetPidToShareableHandle(shareable_handle, pids_data, cnt); if (error_code != 0) { spdlog::error("aclrtMemSetPidToShareableHandle failed, error_code: {}", error_code); throw std::runtime_error("aclrtMemSetPidToShareableHandle failed"); } else { spdlog::info("aclrtMemSetPidToShareableHandle succeeded, num_pids: {}", cnt); } } void start_daemon() { init_acl(); aclError error_code; size_t free_mem = 0, total = 0; error_code = aclrtGetMemInfo(ACL_HBM_MEM, &free_mem, &total); if (error_code != 0) { spdlog::error("aclrtGetMemInfo failed, error_code: {}", error_code); throw std::runtime_error("aclrtGetMemInfo failed"); } else { spdlog::info("aclrtGetMemInfo succeeded, free_mem: {}, total: {}", free_mem, total); } uint32_t device = 0; aclrtPhysicalMemProp prop = {}; prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; prop.memAttr = ACL_HBM_MEM_HUGE; prop.location.id = device; prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; prop.reserve = 0; size_t granularity; error_code = aclrtMemGetAllocationGranularity( &prop, ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM, &granularity); if (error_code != 0) { spdlog::error("aclrtMemGetAllocationGranularity failed, error_code: {}", error_code); throw std::runtime_error("aclrtMemGetAllocationGranularity failed"); } else { spdlog::info("aclrtMemGetAllocationGranularity succeeded, granularity: {}", granularity); } if (free_mem < reserved_mem_size) { spdlog::error("Not enough free memory to reserve: {}, free_mem: {}", reserved_mem_size, free_mem); throw std::runtime_error("Not enough free memory to reserve"); } size_t g_size = free_mem - reserved_mem_size; g_size = (g_size / granularity) * granularity; // allocate physical memory aclrtDrvMemHandle mem_handle; error_code = aclrtMallocPhysical(&mem_handle, g_size, &prop, 0); if (error_code != 0) { spdlog::error("aclrtMallocPhysical failed, error_code: {}", error_code); throw std::runtime_error("aclrtMallocPhysical failed"); } else { spdlog::info("aclrtMallocPhysical succeeded, size: {}", g_size); } // // reserve address // void *vmem_addr = nullptr; // error_code = aclrtReserveMemAddress(&vmem_addr, g_size, 0, nullptr, 0); // if (error_code != 0) { // spdlog::error("aclrtReserveMemAddress failed, error_code: {}", error_code); // throw std::runtime_error("aclrtReserveMemAddress failed"); // } else { // spdlog::info("aclrtReserveMemAddress succeeded, vmem_addr: {}", vmem_addr); // } // // map // error_code = aclrtMapMem(vmem_addr, g_size, 0, mem_handle, 0); // if (error_code != 0) { // spdlog::error("aclrtMapMem failed, error_code: {}", error_code); // throw std::runtime_error("aclrtMapMem failed"); // } else { // spdlog::info("aclrtMapMem succeeded, vmem_addr: {}", vmem_addr); // } // export uint64_t shareable_handle; error_code = aclrtMemExportToShareableHandle( mem_handle, ACL_MEM_HANDLE_TYPE_NONE, ACL_RT_VMM_EXPORT_FLAG_DEFAULT, &shareable_handle); if (error_code != 0) { spdlog::error("aclrtMemExportToShareableHandle failed, error_code: {}", error_code); throw std::runtime_error("aclrtMemExportToShareableHandle failed"); } else { spdlog::info( "aclrtMemExportToShareableHandle succeeded, shareable_handle: {}", shareable_handle); } // shm shm_manager = new ShmManager(); shm_manager->set_gpu_info(g_size, shareable_handle); shm_manager->register_callback_on_worker_change( [&](const std::vector &pids) { reset_pids(pids, shareable_handle); }); // start busy loop shm_manager->run_busy_loop(); // stopped by signal delete shm_manager; shm_manager = nullptr; // free physical memory error_code = aclrtFreePhysical(mem_handle); if (error_code != 0) { spdlog::error("aclrtFreePhysical failed, error_code: {}", error_code); throw std::runtime_error("aclrtFreePhysical failed"); } } int main() { install_signal_handlers(); start_daemon(); return 0; }