add vxpu

2026-02-05 19:36:06 +08:00
parent 070bfa4a73
commit e273ef01b8
131 changed files with 28539 additions and 2 deletions
--- a/vllm_kunlun/csrc/vxpu_offload/shm_helper.h
+++ b/vllm_kunlun/csrc/vxpu_offload/shm_helper.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <atomic>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <vector>
+#include <chrono>
+#include <string>
+
+#include "spdlog/spdlog.h"
+#include "xpu/runtime.h"
+
+
+// Capacity limits for the fixed-size arrays declared in ShmHelper below.
+// MAX_WORKERS: number of WorkerHeartBeat slots (ShmHelper::heart_beats).
+#define MAX_WORKERS 60
+// MAX_DEVICES: number of per-device entries (ShmHelper::gpu_flag,
+// gpu_pci_addr, vmem_size and xpu_mem_handle).
+#define MAX_DEVICES 32
+// static constexpr const char *SHM_NAME = "/vllm_kunlun_vxpu_offload_shm";
+// Returns the shared-memory object name to use.
+//
+// The environment variable VLLM_VXPU_SHM_NAME overrides the built-in default.
+// An override that does not begin with '/' (including an empty value) is
+// rejected: an error is logged and the process exits with status -1.
+// NOTE(review): the leading '/' presumably matches what shm_open() expects;
+// the call site is not visible from this header.
+static inline std::string get_shm_name() {
+  const char *override_name = getenv("VLLM_VXPU_SHM_NAME");
+  if (override_name == nullptr) {
+    // No override configured: fall back to the default name.
+    return std::string("/vllm_kunlun_vxpu_offload_shm");
+  }
+  if (override_name[0] == '/') {
+    return std::string(override_name);
+  }
+  spdlog::error("The shm name specified by VLLM_VXPU_SHM_NAME should start "
+                "with '/'");
+  exit(-1);
+}
+
+// Heartbeat period, in microseconds.
+static constexpr uint32_t heartbeat_us = 1000;
+// Heartbeat timeout, in microseconds: twenty heartbeat periods.
+static constexpr uint32_t heartbeat_timeout_us = heartbeat_us * 20;
+
+// One heartbeat slot: a timestamp plus the id of the worker that owns it.
+//
+// The struct is aligned to 64 bytes and explicitly padded out to 64 bytes, so
+// consecutive slots in an array never share a 64-byte line (presumably the
+// cache-line size of the target machines).
+struct alignas(64) WorkerHeartBeat {
+  // Last heartbeat time; the unit is presumably microseconds as produced by
+  // heartbeat_ts_us() -- confirm against the writer.
+  std::atomic<uint64_t> timestamp;
+  // Id of the worker publishing into this slot.
+  std::atomic<int32_t> worker_id;
+  // Filler that brings the payload up to exactly 64 bytes.
+  uint8_t _padding[64 - sizeof(std::atomic<uint64_t>) -
+                   sizeof(std::atomic<int32_t>)];
+};
+
+// The layout is shared by every party that maps the segment, so pin it down at
+// compile time instead of trusting the hand-computed padding above.
+static_assert(sizeof(WorkerHeartBeat) == 64,
+              "WorkerHeartBeat must occupy exactly 64 bytes");
+static_assert(alignof(WorkerHeartBeat) == 64,
+              "WorkerHeartBeat must be 64-byte aligned");
+
+// Returns the current std::chrono::steady_clock reading, expressed as whole
+// microseconds since that clock's epoch.
+static inline uint64_t heartbeat_ts_us() {
+  using std::chrono::microseconds;
+  const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+  const auto ticks_us =
+      std::chrono::duration_cast<microseconds>(since_epoch).count();
+  return static_cast<uint64_t>(ticks_us);
+}
+
+// Extracts the lock field, i.e. the upper 32 bits of a packed gpu_flag word.
+static inline uint32_t unpack_lock_field(uint64_t gpu_flag) {
+  const uint64_t upper_half = gpu_flag >> 32;
+  return static_cast<uint32_t>(upper_half);
+}
+
+// Extracts the worker id, i.e. the lower 32 bits of a packed gpu_flag word,
+// reinterpreted as a signed 32-bit value.
+static inline int32_t unpack_worker_id_field(uint64_t gpu_flag) {
+  // Truncating to uint32_t keeps exactly the low 32 bits.
+  const uint32_t lower_half = static_cast<uint32_t>(gpu_flag);
+  return static_cast<int32_t>(lower_half);
+}
+
+// Builds a gpu_flag word with the lock field (upper 32 bits) set to 1 and
+// `worker_id` stored in the lower 32 bits.
+static inline uint64_t pack_locked_worker_id(int32_t worker_id) {
+  // Convert through uint32_t first: casting a negative int32_t straight to
+  // uint64_t sign-extends and would overwrite the lock field with all ones.
+  const uint64_t id_bits = static_cast<uint32_t>(worker_id);
+  return (static_cast<uint64_t>(1) << 32) | id_bits;
+}
+
+// Builds a gpu_flag word with the lock field (upper 32 bits) cleared and
+// `worker_id` stored in the lower 32 bits.
+static inline uint64_t pack_unlocked_worker_id(int32_t worker_id) {
+  // Convert through uint32_t first: casting a negative int32_t straight to
+  // uint64_t sign-extends, leaving a non-zero lock field that reads as locked.
+  const uint32_t id_bits = static_cast<uint32_t>(worker_id);
+  return static_cast<uint64_t>(id_bits);
+}
+
+// Control block laid out for the shared-memory segment (see SHM_SIZE).
+// The 64-byte alignment is expected to be satisfied by the mapping itself,
+// since mmap usually returns page-aligned addresses.
+struct alignas(64) ShmHelper {
+  // Per-device lock word. The pack_*/unpack_* helpers in this header treat the
+  // upper 32 bits as the lock field and the lower 32 bits as a worker id.
+  std::atomic<uint64_t> gpu_flag[MAX_DEVICES];
+  // uint8_t _padding1[64 - sizeof(std::atomic<uint64_t>)];
+
+  // Per-device information.
+  uint32_t gpu_pci_addr[MAX_DEVICES];
+  size_t vmem_size[MAX_DEVICES];
+  XPUIpcMemHandle xpu_mem_handle[MAX_DEVICES];
+
+  // Kinds of request that can be placed in `request`.
+  enum RequestType: uint32_t {
+    REQUEST_TYPE_REGISTER_WORKER = 1,
+  };
+  /* Values taken by `req_ready`, and who stores each one:
+    * 0 (worker):   no request pending / worker has consumed the response
+    * 1 (worker):   worker is filling in the request
+    * 2 (worker):   request is complete and waiting for the listener
+    * 3 (listener): listener has processed the request
+  */
+  enum ReadyState : uint64_t {
+    READY_STATE_NO_REQUEST = 0,
+    READY_STATE_PREPARING_REQUEST = 1,
+    READY_STATE_REQUEST_READY = 2,
+    READY_STATE_REQUEST_PROCESSED = 3
+  };
+  // Current ReadyState of the single request slot below.
+  std::atomic<uint64_t> req_ready;
+  // The request slot; for now it carries one parameter and one response.
+  struct {
+    uint32_t type;
+    int32_t worker_id;
+    uint64_t parameter;
+    uint64_t response;
+  } request;
+  // Filler so that req_ready + request together span 64 bytes.
+  uint8_t _padding2[64 - sizeof(req_ready) - sizeof(request)];
+
+  // One heartbeat slot per worker.
+  WorkerHeartBeat heart_beats[MAX_WORKERS];
+
+  // Resets every device lock word to 0 and marks the request slot as empty.
+  // No other member is touched.
+  void init() {
+    for (auto &flag : gpu_flag) {
+      flag.store(0, std::memory_order_release);
+    }
+    req_ready.store(READY_STATE_NO_REQUEST, std::memory_order_release);
+  }
+};
+
+// Size of the shared-memory segment: sizeof(ShmHelper) rounded up to the next
+// multiple of 4096 bytes.
+static constexpr size_t SHM_SIZE =
+    (sizeof(ShmHelper) + 4095) / 4096 * 4096;