add vxpu
This commit is contained in:
117
vllm_kunlun/csrc/vxpu_offload/shm_helper.h
Normal file
117
vllm_kunlun/csrc/vxpu_offload/shm_helper.h
Normal file
@@ -0,0 +1,117 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <string>
|
||||
|
||||
#include "spdlog/spdlog.h"
|
||||
#include "xpu/runtime.h"
|
||||
|
||||
|
||||
// Capacity limits for the fixed-size arrays in the shared-memory layout.
// MAX_WORKERS: number of heartbeat slots (ShmHelper::heart_beats).
#define MAX_WORKERS 60
// MAX_DEVICES: number of entries in each per-device array of ShmHelper.
#define MAX_DEVICES 32
|
||||
// static constexpr const char *SHM_NAME = "/vllm_kunlun_vxpu_offload_shm";
|
||||
static inline std::string get_shm_name() {
|
||||
const char *env_shm_name = getenv("VLLM_VXPU_SHM_NAME");
|
||||
if (env_shm_name) {
|
||||
if (env_shm_name[0] != '/') {
|
||||
spdlog::error("The shm name specified by VLLM_VXPU_SHM_NAME should start "
|
||||
"with '/'");
|
||||
exit(-1);
|
||||
}
|
||||
return std::string(env_shm_name);
|
||||
}
|
||||
return std::string("/vllm_kunlun_vxpu_offload_shm");
|
||||
}
|
||||
|
||||
// Base heartbeat unit, in microseconds (presumably the interval between two
// heartbeat updates -- confirm against the code that writes the heartbeats).
static constexpr uint32_t heartbeat_us = 1000;
// Heartbeat timeout in microseconds: 20 base heartbeat units.
static constexpr uint32_t heartbeat_timeout_us = heartbeat_us * 20;
|
||||
|
||||
// A single heartbeat slot, aligned to 64 bytes and explicitly padded so that
// its members add up to 64 bytes.
struct alignas(64) WorkerHeartBeat {
  // Time of the latest heartbeat (presumably a heartbeat_ts_us() value --
  // confirm against the writer).
  std::atomic<uint64_t> timestamp;
  // Identifier of the worker that owns this slot.
  std::atomic<int32_t> worker_id;
  // Filler: 64 bytes minus the combined size of the two atomics above.
  uint8_t _padding[64 - (sizeof(std::atomic<uint64_t>) +
                         sizeof(std::atomic<int32_t>))];
};
|
||||
|
||||
// Returns the current std::chrono::steady_clock reading, expressed as whole
// microseconds since that clock's epoch.
static inline uint64_t heartbeat_ts_us() {
  using std::chrono::microseconds;
  using std::chrono::steady_clock;

  const auto since_epoch = steady_clock::now().time_since_epoch();
  const auto elapsed_us =
      std::chrono::duration_cast<microseconds>(since_epoch).count();
  return static_cast<uint64_t>(elapsed_us);
}
|
||||
|
||||
// Extracts the lock field, i.e. the upper 32 bits of a 64-bit flag word.
static inline uint32_t unpack_lock_field(uint64_t gpu_flag) {
  const uint64_t upper_half = gpu_flag >> 32;
  return static_cast<uint32_t>(upper_half);
}
|
||||
|
||||
// Extracts the worker id, i.e. the lower 32 bits of a 64-bit flag word,
// reinterpreted as a signed 32-bit integer.
static inline int32_t unpack_worker_id_field(uint64_t gpu_flag) {
  // Truncating to uint32_t keeps exactly the low 32 bits.
  const uint32_t lower_half = static_cast<uint32_t>(gpu_flag);
  return static_cast<int32_t>(lower_half);
}
|
||||
|
||||
// Builds a 64-bit flag word in the "locked" state for `worker_id`:
// the lock field (upper 32 bits) is set to 1 and the lower 32 bits hold the
// bit pattern of `worker_id`.
//
// The result round-trips through unpack_lock_field() (== 1) and
// unpack_worker_id_field() (== worker_id) for every int32_t value, including
// negative ones.
static inline uint64_t pack_locked_worker_id(int32_t worker_id) {
  // Convert through uint32_t first: casting a negative int32_t straight to
  // uint64_t sign-extends it and would fill bits 32..63, corrupting the
  // lock field.
  const uint64_t id_bits = static_cast<uint32_t>(worker_id);
  return (static_cast<uint64_t>(1) << 32) | id_bits;
}
|
||||
|
||||
// Builds a 64-bit flag word in the "unlocked" state for `worker_id`:
// the lock field (upper 32 bits) is 0 and the lower 32 bits hold the bit
// pattern of `worker_id`.
//
// The result round-trips through unpack_lock_field() (== 0) and
// unpack_worker_id_field() (== worker_id) for every int32_t value, including
// negative ones.
static inline uint64_t pack_unlocked_worker_id(int32_t worker_id) {
  // Convert through uint32_t first: casting a negative int32_t straight to
  // uint64_t sign-extends it, which would set the upper 32 bits and make an
  // unlocked word look locked.
  return static_cast<uint64_t>(static_cast<uint32_t>(worker_id));
}
|
||||
|
||||
// mmap usually page-aligned
|
||||
struct alignas(64) ShmHelper {
|
||||
// GPU lock flag
|
||||
std::atomic<uint64_t> gpu_flag[MAX_DEVICES];
|
||||
// uint8_t _padding1[64 - sizeof(std::atomic<uint64_t>)];
|
||||
|
||||
// GPU Info
|
||||
uint32_t gpu_pci_addr[MAX_DEVICES];
|
||||
size_t vmem_size[MAX_DEVICES];
|
||||
XPUIpcMemHandle xpu_mem_handle[MAX_DEVICES];
|
||||
|
||||
// request
|
||||
enum RequestType: uint32_t {
|
||||
REQUEST_TYPE_REGISTER_WORKER = 1,
|
||||
};
|
||||
/* ready:
|
||||
* 0: worker store: no request & worker get response
|
||||
* 1: worker store: worker preparing request
|
||||
* 2: worker store: request ready for listener
|
||||
* 3: listener store: listener processed request
|
||||
*/
|
||||
enum ReadyState : uint64_t {
|
||||
READY_STATE_NO_REQUEST = 0,
|
||||
READY_STATE_PREPARING_REQUEST = 1,
|
||||
READY_STATE_REQUEST_READY = 2,
|
||||
READY_STATE_REQUEST_PROCESSED = 3
|
||||
};
|
||||
std::atomic<uint64_t> req_ready;
|
||||
// currently only allow one parameter and one response
|
||||
struct {
|
||||
uint32_t type;
|
||||
int32_t worker_id;
|
||||
uint64_t parameter;
|
||||
uint64_t response;
|
||||
} request;
|
||||
uint8_t _padding2[64 - sizeof(req_ready) - sizeof(request)];
|
||||
|
||||
// heart beats
|
||||
WorkerHeartBeat heart_beats[MAX_WORKERS];
|
||||
|
||||
void init() {
|
||||
for (size_t i = 0; i < MAX_DEVICES; ++i) {
|
||||
gpu_flag[i].store(0, std::memory_order_release);
|
||||
}
|
||||
req_ready.store(READY_STATE_NO_REQUEST, std::memory_order_release);
|
||||
}
|
||||
};
|
||||
|
||||
// sizeof(ShmHelper) rounded up to the next multiple of 4096 bytes.
static constexpr size_t SHM_SIZE =
    ((sizeof(ShmHelper) + 4095) / 4096) * 4096;
|
||||
Reference in New Issue
Block a user