Files
xc-llm-ascend/csrc/vnpu_offload/shm_helper.h
Jing Wang b6549b6e38 Add feature: priority
Signed-off-by: Jing Wang <jingwang96@qq.com>
2026-05-13 06:16:25 +00:00

155 lines
4.5 KiB
C++

#pragma once
#include <atomic>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <vector>
#include <chrono>
#include <string>
#include "spdlog/spdlog.h"
#define MAX_WORKERS 64
#define MAX_DEVICES 32
// Returns the shm name to use.
// The VLLM_VNPU_SHM_NAME environment variable, when set, overrides the
// built-in default "/vllm_acl_vnpu_offload_shm". An override whose first
// character is not '/' (this includes an empty value) is logged as an error
// and the process is terminated with exit(-1).
static inline std::string get_shm_name() {
  const char *override_name = getenv("VLLM_VNPU_SHM_NAME");
  if (override_name == nullptr) {
    // No override in the environment: fall back to the default name.
    return std::string("/vllm_acl_vnpu_offload_shm");
  }
  if (override_name[0] != '/') {
    spdlog::error(
        "The shm name specified by VLLM_VNPU_SHM_NAME should start "
        "with '/'");
    exit(-1);
  }
  return std::string(override_name);
}
// Heartbeat timing parameters.
// heartbeat_us: heartbeat period, in microseconds.
static constexpr uint32_t heartbeat_us{1000};
// heartbeat_check_everyN: number of heartbeat periods per check
// (presumably the checker runs once every N beats -- confirm against callers).
static constexpr uint32_t heartbeat_check_everyN{100};
// heartbeat_timeout_us: N heartbeat periods, in microseconds.
static constexpr uint32_t heartbeat_timeout_us{heartbeat_us *
                                               heartbeat_check_everyN};
// Heartbeat slot for one worker: a timestamp and a tgid, both atomic.
// The struct is 64-byte aligned and explicitly padded so that each slot
// occupies 64 bytes.
struct alignas(64) WorkerHeartBeat {
  std::atomic<uint64_t> timestamp;
  std::atomic<int32_t> tgid;
  // Filler for the part of the 64-byte slot not used by the two atomics.
  uint8_t _padding[64 - (sizeof(timestamp) + sizeof(tgid))];
};
// Reads std::chrono::steady_clock and returns the time since that clock's
// epoch as a whole number of microseconds.
static inline uint64_t heartbeat_ts_us() {
  using std::chrono::microseconds;
  using std::chrono::steady_clock;
  const steady_clock::duration since_epoch =
      steady_clock::now().time_since_epoch();
  const microseconds as_us =
      std::chrono::duration_cast<microseconds>(since_epoch);
  return static_cast<uint64_t>(as_us.count());
}
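// GPU flag layout (64 bits, most significant bit first):
// [reserved (31 bits) | lock (1 bit, bit 32) | tgid (32 bits, bits 31..0)]
// The upper 32 bits (reserved + lock) are read back together as the lock field.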
// Returns the upper 32 bits of a packed GPU flag (the lock field).
static inline uint32_t unpack_lock_field(uint64_t gpu_flag) {
  const uint64_t upper_half = gpu_flag >> 32;
  return static_cast<uint32_t>(upper_half);
}
// Returns the lower 32 bits of a packed GPU flag, reinterpreted as a signed
// tgid.
static inline int32_t unpack_tgid_field(uint64_t gpu_flag) {
  const uint32_t lower_half = static_cast<uint32_t>(gpu_flag);
  return static_cast<int32_t>(lower_half);
}
// Builds a GPU flag value with the lock bit (bit 32) set and `tgid` stored in
// the lower 32 bits.
static inline uint64_t pack_locked_tgid(int32_t tgid) {
  const uint64_t lock_bit = static_cast<uint64_t>(1) << 32;
  // Convert through uint32_t first: casting a negative int32_t straight to
  // uint64_t sign-extends and would overwrite the lock/reserved bits.
  const uint64_t tgid_bits = static_cast<uint32_t>(tgid);
  return lock_bit | tgid_bits;
}
// Builds a GPU flag value with the lock field clear and `tgid` stored in the
// lower 32 bits.
static inline uint64_t pack_unlocked_tgid(int32_t tgid) {
  // Convert through uint32_t first: casting a negative int32_t straight to
  // uint64_t sign-extends, which would make the upper 32 bits non-zero and
  // the result read back as locked.
  const uint64_t tgid_bits = static_cast<uint32_t>(tgid);
  return tgid_bits;
}
// waiting_worker_flag layout (64 bits):
// [ device_id (5 bits) | priority (3 bits) | timestamp (24 bits) | tgid (32 bits)]
// Returns the device id stored in the top 5 bits (63..59) of a waiting flag.
static inline uint32_t unpack_waiting_device_id(uint64_t flag) {
  constexpr unsigned kDeviceIdShift = 59;
  const uint64_t device_bits = flag >> kDeviceIdShift;
  return static_cast<uint32_t>(device_bits);
}
// Returns the 3-bit priority stored in bits 58..56 of a waiting flag.
static inline uint16_t unpack_waiting_priority(uint64_t flag) {
  const uint64_t shifted = flag >> 56;
  const uint64_t priority_bits = shifted & 0x7;
  return static_cast<uint16_t>(priority_bits);
}
// Returns the 24-bit timestamp stored in bits 55..32 of a waiting flag.
static inline uint32_t unpack_waiting_timestamp_ms(uint64_t flag) {
  const uint64_t shifted = flag >> 32;
  const uint64_t timestamp_bits = shifted & 0xFFFFFF;
  return static_cast<uint32_t>(timestamp_bits);
}
// Returns the lower 32 bits of a waiting flag, reinterpreted as a signed tgid.
static inline int32_t unpack_waiting_tgid(uint64_t flag) {
  const uint32_t lower_half = static_cast<uint32_t>(flag);
  return static_cast<int32_t>(lower_half);
}
// Packs the four waiting-worker fields into one 64-bit flag:
//   bits 63..59  device_id (low 5 bits kept)
//   bits 58..56  priority  (low 3 bits kept)
//   bits 55..32  timestamp (low 24 bits kept)
//   bits 31..0   tgid
// Bits of an argument beyond its field width are discarded.
static inline uint64_t pack_waiting_flag(uint32_t device_id, uint16_t priority,
                                         uint32_t timestamp, int32_t tgid) {
  const uint64_t device_bits = static_cast<uint64_t>(device_id & 0x1F) << 59;
  const uint64_t priority_bits = static_cast<uint64_t>(priority & 0x7) << 56;
  const uint64_t timestamp_bits =
      static_cast<uint64_t>(timestamp & 0xFFFFFF) << 32;
  // The mask removes the sign extension a negative tgid would bring along.
  const uint64_t tgid_bits = static_cast<uint64_t>(tgid) & 0xFFFFFFFF;
  return device_bits | priority_bits | timestamp_bits | tgid_bits;
}
// mmap usually page-aligned
struct alignas(64) ShmHelper {
struct VramInfo {
uint64_t total_vmem_size;
uint64_t shareable_handle;
};
VramInfo vram_info[MAX_DEVICES]; // support max 32 devices
// GPU lock flag
std::atomic<uint64_t> gpu_flag[MAX_DEVICES];
std::atomic<uint64_t> waiting_worker_flags[MAX_WORKERS];
// request
enum RequestType: uint32_t {
REQUEST_TYPE_REGISTER_WORKER = 1,
};
/* ready:
* 0: worker store: no request & worker get response
* 1: worker store: worker preparing request
* 2: worker store: request ready for listener
* 3: listener store: listener processed request
*/
enum ReadyState : uint64_t {
READY_STATE_NO_REQUEST = 0,
READY_STATE_PREPARING_REQUEST = 1,
READY_STATE_REQUEST_READY = 2,
READY_STATE_REQUEST_PROCESSED = 3
};
std::atomic<uint64_t> req_ready;
// currently only allow one parameter and one response
struct {
uint32_t type;
int32_t tgid;
uint64_t parameter;
uint64_t response;
} request;
uint8_t _padding2[64 - sizeof(std::atomic<uint64_t>) - sizeof(request)];
// heart beats
WorkerHeartBeat heart_beats[MAX_WORKERS];
void init() {
memset(vram_info, 0, sizeof(vram_info));
for (size_t i = 0; i < MAX_DEVICES; ++i) {
gpu_flag[i].store(0, std::memory_order_release);
}
req_ready.store(READY_STATE_NO_REQUEST, std::memory_order_release);
}
void set_gpu_info(int gpu_id, uint64_t vmem_size, uint64_t shared_handle) {
vram_info[gpu_id].total_vmem_size = vmem_size;
vram_info[gpu_id].shareable_handle = shared_handle;
}
};
// sizeof(ShmHelper) rounded up to the next multiple of 4096 bytes
// (presumably the page size -- confirm against the mmap call site).
static constexpr size_t SHM_SIZE = ((sizeof(ShmHelper) + 4095) / 4096) * 4096;