#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #include "xpu/runtime.h" #define MAX_WORKERS 60 #define MAX_DEVICES 32 // static constexpr const char *SHM_NAME = "/vllm_kunlun_vxpu_offload_shm"; static inline std::string get_shm_name() { const char *env_shm_name = getenv("VLLM_VXPU_SHM_NAME"); if (env_shm_name) { if (env_shm_name[0] != '/') { spdlog::error("The shm name specified by VLLM_VXPU_SHM_NAME should start " "with '/'"); exit(-1); } return std::string(env_shm_name); } return std::string("/vllm_kunlun_vxpu_offload_shm"); } static constexpr uint32_t heartbeat_us = 1000; // microseconds static constexpr uint32_t heartbeat_timeout_us = 20 * heartbeat_us; struct alignas(64) WorkerHeartBeat { std::atomic timestamp; std::atomic worker_id; uint8_t _padding[64 - sizeof(std::atomic) - sizeof(std::atomic)]; }; static inline uint64_t heartbeat_ts_us() { return static_cast( std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count()); } static inline uint32_t unpack_lock_field(uint64_t gpu_flag) { return static_cast(gpu_flag >> 32); } static inline int32_t unpack_worker_id_field(uint64_t gpu_flag) { return static_cast(gpu_flag & 0xFFFFFFFF); } static inline uint64_t pack_locked_worker_id(int32_t worker_id) { return (static_cast(1) << 32) | static_cast(worker_id); } static inline uint64_t pack_unlocked_worker_id(int32_t worker_id) { return static_cast(worker_id); } // mmap usually page-aligned struct alignas(64) ShmHelper { // GPU lock flag std::atomic gpu_flag[MAX_DEVICES]; // uint8_t _padding1[64 - sizeof(std::atomic)]; // GPU Info uint32_t gpu_pci_addr[MAX_DEVICES]; size_t vmem_size[MAX_DEVICES]; XPUIpcMemHandle xpu_mem_handle[MAX_DEVICES]; // request enum RequestType: uint32_t { REQUEST_TYPE_REGISTER_WORKER = 1, }; /* ready: * 0: worker store: no request & worker get response * 1: worker store: worker preparing request * 2: worker store: request ready for listener * 3: listener store: listener processed request */ enum ReadyState : uint64_t { READY_STATE_NO_REQUEST = 0, READY_STATE_PREPARING_REQUEST = 1, READY_STATE_REQUEST_READY = 2, READY_STATE_REQUEST_PROCESSED = 3 }; std::atomic req_ready; // currently only allow one parameter and one response struct { uint32_t type; int32_t worker_id; uint64_t parameter; uint64_t response; } request; uint8_t _padding2[64 - sizeof(req_ready) - sizeof(request)]; // heart beats WorkerHeartBeat heart_beats[MAX_WORKERS]; void init() { for (size_t i = 0; i < MAX_DEVICES; ++i) { gpu_flag[i].store(0, std::memory_order_release); } req_ready.store(READY_STATE_NO_REQUEST, std::memory_order_release); } }; static constexpr size_t SHM_SIZE = (sizeof(ShmHelper) + 4095) & ~4095;