#pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include "spdlog/spdlog.h" #define MAX_WORKERS 60 #define MAX_DEVICES 16 static inline std::string get_shm_name() { const char *env_shm_name = getenv("VLLM_VNPU_SHM_NAME"); if (env_shm_name) { if (env_shm_name[0] != '/') { spdlog::error( "The shm name specified by VLLM_VNPU_SHM_NAME should start " "with '/'"); exit(-1); } return std::string(env_shm_name); } return std::string("/vllm_acl_vnpu_offload_shm"); } static constexpr uint32_t heartbeat_us = 1000; // microseconds static constexpr uint32_t heartbeat_check_everyN = 50; static constexpr uint32_t heartbeat_timeout_us = heartbeat_check_everyN * heartbeat_us; struct alignas(64) WorkerHeartBeat { std::atomic timestamp; std::atomic tgid; uint8_t _padding[64 - sizeof(std::atomic) - sizeof(std::atomic)]; }; static inline uint64_t heartbeat_ts_us() { return static_cast( std::chrono::duration_cast( std::chrono::steady_clock::now().time_since_epoch()) .count()); } static inline uint32_t unpack_lock_field(uint64_t gpu_flag) { return static_cast(gpu_flag >> 32); } static inline int32_t unpack_tgid_field(uint64_t gpu_flag) { return static_cast(gpu_flag & 0xFFFFFFFF); } static inline uint64_t pack_locked_tgid(int32_t tgid) { return (static_cast(1) << 32) | static_cast(tgid); } static inline uint64_t pack_unlocked_tgid(int32_t tgid) { return static_cast(tgid); } // mmap usually page-aligned struct alignas(64) ShmHelper { struct VramInfo { uint64_t total_vmem_size; uint64_t shareable_handle; }; VramInfo vram_info[MAX_DEVICES]; // support max 16 NPUs // GPU lock flag std::atomic gpu_flag[MAX_DEVICES]; // uint8_t _padding1[64 - sizeof(std::atomic)]; // request enum RequestType: uint32_t { REQUEST_TYPE_REGISTER_WORKER = 1, }; /* ready: * 0: worker store: no request & worker get response * 1: worker store: worker preparing request * 2: worker store: request ready for listener * 3: listener store: listener processed request */ enum ReadyState : uint64_t { READY_STATE_NO_REQUEST = 0, READY_STATE_PREPARING_REQUEST = 1, READY_STATE_REQUEST_READY = 2, READY_STATE_REQUEST_PROCESSED = 3 }; std::atomic req_ready; // currently only allow one parameter and one response struct { uint32_t type; int32_t tgid; uint64_t parameter; uint64_t response; } request; uint8_t _padding2[64 - sizeof(std::atomic) - sizeof(request)]; // heart beats WorkerHeartBeat heart_beats[MAX_WORKERS]; void init() { memset(vram_info, 0, sizeof(vram_info)); for (size_t i = 0; i < MAX_DEVICES; ++i) { gpu_flag[i].store(0, std::memory_order_release); } req_ready.store(READY_STATE_NO_REQUEST, std::memory_order_release); } void set_gpu_info(int gpu_id, uint64_t vmem_size, uint64_t shared_handle) { vram_info[gpu_id].total_vmem_size = vmem_size; vram_info[gpu_id].shareable_handle = shared_handle; } }; static constexpr size_t SHM_SIZE = (sizeof(ShmHelper) + 4095) & ~4095;