Add feature: priority

Signed-off-by: Jing Wang <jingwang96@qq.com>
2026-05-12 11:51:57 +00:00
parent d627a45881
commit b6549b6e38
11 changed files with 382 additions and 66 deletions
--- a/csrc/vnpu_offload/shm_worker.cpp
+++ b/csrc/vnpu_offload/shm_worker.cpp
@@ -1,7 +1,29 @@
 #include "shm_worker.h"

+// Reads this worker's scheduling priority from the VLLM_VNPU_PRIORITY
+// environment variable.
+//
+// Returns the parsed value when the variable holds a whole decimal number in
+// [0, 7]. Returns 0 after logging a warning when the variable is set but
+// malformed or out of range, and returns 0 silently when it is unset.
+static inline uint16_t get_shm_priority() {
+  const char *env_priority = getenv("VLLM_VNPU_PRIORITY");
+  if (!env_priority) {
+    return 0;
+  }
+
+  // strtol reports where parsing stopped, so input such as "3abc" or "1.9"
+  // is rejected instead of being silently truncated to its numeric prefix.
+  char *end = nullptr;
+  const long parsed = strtol(env_priority, &end, 10);
+  const bool no_digits = (end == env_priority);
+  // Tolerate trailing blanks, e.g. from a quoted shell assignment.
+  while (*end == ' ' || *end == '\t') {
+    ++end;
+  }
+  if (no_digits || *end != '\0') {
+    spdlog::warn("Invalid VLLM_VNPU_PRIORITY format. Using default 0.");
+    return 0;
+  }
+
+  // On overflow strtol saturates to LONG_MIN/LONG_MAX, which fails this check
+  // too; the raw text is logged so the user sees exactly what was supplied.
+  if (parsed < 0 || parsed > 7) {
+    spdlog::warn("VLLM_VNPU_PRIORITY should be between 0 and 7, got {}. Using default 0.", env_priority);
+    return 0;
+  }
+  return static_cast<uint16_t>(parsed);
+}
+

 ShmWorker::ShmWorker() {
+  this->priority = get_shm_priority();
+  this->waiting_timestamp = 0;
+  this->is_waiting = false;
+  this->is_holding_lock = false;
+  spdlog::info("vNPU worker initialized with priority {}", priority);
  std::string shm_name = get_shm_name();
  int shm_fd = shm_open(shm_name.c_str(), O_RDWR, 0666);
  if (shm_fd == -1) {
@@ -40,16 +62,18 @@ bool ShmWorker::register_worker(int32_t tgid, int gpu_id,
  if (slot == -1) {
    return false;
  }
+  this->shm_slot = slot;

  *out_shareable_handle = shm_helper->vram_info[gpu_id].shareable_handle;
  *out_vmem_size = shm_helper->vram_info[gpu_id].total_vmem_size;

  stop_heart_beat.store(false, std::memory_order_release);
-  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this, slot);
+  heart_beat_thread = std::thread(&ShmWorker::heart_beat_loop, this);
  return true;
 }

-void ShmWorker::heart_beat_loop(int slot) {
+void ShmWorker::heart_beat_loop() {
+  int slot = this->shm_slot;
  while (!stop_heart_beat.load(std::memory_order_acquire)) {
    // update heart beat
    int32_t shm_tgid =
@@ -64,6 +88,7 @@ void ShmWorker::heart_beat_loop(int slot) {
        spdlog::error("TGID {} failed to re-register as worker", tgid);
        throw std::runtime_error("Failed to re-register as worker");
      }
+      this->shm_slot = slot;
    }
    uint64_t now = heartbeat_ts_us();
    shm_helper->heart_beats[slot].timestamp.store(now,
@@ -72,32 +97,95 @@ void ShmWorker::heart_beat_loop(int slot) {
  }
 }

+// Publishes this worker's waiting flag into its shared-memory slot.
+//
+// Does nothing when the worker is already marked as waiting, so the timestamp
+// recorded by the first call is preserved. Otherwise the flag is built from
+// gpu_id, priority, the low 24 bits of the current millisecond timestamp and
+// tgid, stored with release ordering, and is_waiting is set.
+void ShmWorker::start_wait() {
+  if (!is_waiting) {
+    // Only 24 bits of the millisecond clock are kept in the flag.
+    const auto now_ms = heartbeat_ts_us() / 1000;
+    waiting_timestamp = static_cast<uint32_t>(now_ms & 0xFFFFFF);
+
+    const uint64_t packed = pack_waiting_flag(gpu_id, priority, waiting_timestamp, tgid);
+    shm_helper->waiting_worker_flags[shm_slot].store(packed, std::memory_order_release);
+    is_waiting = true;
+  }
+}
+
+// Withdraws this worker's waiting flag from its shared-memory slot.
+//
+// Does nothing when the worker is not marked as waiting. Otherwise the slot is
+// overwritten with 0 (release ordering) and is_waiting is cleared.
+void ShmWorker::cancel_wait() {
+  if (is_waiting) {
+    // A zero flag is what scanners skip as "nobody waiting in this slot".
+    shm_helper->waiting_worker_flags[shm_slot].store(0, std::memory_order_release);
+    is_waiting = false;
+  }
+}
+
+// Scans every other worker slot's waiting flag and reports whether this
+// worker should yield the GPU to one of them.
+//
+// Returns true when a non-empty flag for the same gpu_id belongs to
+//   * a waiter whose priority value is greater than ours, or
+//   * a waiter of equal priority, provided we do not currently hold the lock
+//     and either we are not waiting at all or the other waiter's timestamp is
+//     older than ours (an exact tie goes to the smaller TGID).
+// Returns false otherwise.
+bool ShmWorker::has_higher_priority_waiter() {
+  for (int idx = 0; idx < MAX_WORKERS; ++idx) {
+    if (idx == shm_slot) {
+      continue;  // our own slot is never a competitor
+    }
+
+    const uint64_t entry = shm_helper->waiting_worker_flags[idx].load(std::memory_order_acquire);
+    const bool competes = entry != 0 && unpack_waiting_device_id(entry) == gpu_id;
+    if (!competes) {
+      continue;  // empty slot, or a waiter for a different device
+    }
+
+    const uint16_t their_prio = unpack_waiting_priority(entry);
+    if (their_prio > priority) {
+      return true;  // strictly higher priority always wins
+    }
+    if (their_prio != priority) {
+      continue;  // lower priority never blocks us
+    }
+
+    // Equal priority from here on.
+    if (is_holding_lock) {
+      continue;  // the current holder does not yield to its peers
+    }
+    if (!is_waiting) {
+      return true;  // they queued up and we did not, so they go first
+    }
+
+    // Both are waiting: compare 24-bit timestamps modulo 2^24. A gap in the
+    // lower half of the range means our timestamp is the later one.
+    const uint32_t their_ts = unpack_waiting_timestamp_ms(entry);
+    const uint32_t gap = (waiting_timestamp - their_ts) & 0xFFFFFF;
+    if (gap == 0) {
+      // Identical timestamps: fall back to TGID ordering.
+      if (unpack_waiting_tgid(entry) < tgid) {
+        return true;
+      }
+    } else if (gap < 0x800000) {
+      return true;  // the other worker started waiting earlier
+    }
+  }
+  return false;
+}
+
 bool ShmWorker::try_lock_gpu(bool &out_self_hold) {
  static int retry_cnt = 0;

  uint64_t old_flag =
      shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
  if (unpack_lock_field(old_flag) == 0) { // free
+    // Check priority: yield if there are higher priority waiters, or same priority waiters who have waited longer.
+    if (has_higher_priority_waiter()) {
+      out_self_hold = false;
+      return false;
+    }
+
    uint64_t new_flag = pack_locked_tgid(tgid);
    if (shm_helper->gpu_flag[gpu_id].compare_exchange_weak(
            old_flag, new_flag, std::memory_order_acq_rel,
            std::memory_order_acquire)) {
-      spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} acquired GPU {} lock", tgid, gpu_id);
      int32_t prev_tgid = unpack_tgid_field(old_flag);
      out_self_hold = prev_tgid == tgid;
      retry_cnt = 0;
+      this->is_holding_lock = true;
      return true;
    }
  } else { // locked
    if (unpack_tgid_field(old_flag) == tgid) {
-      spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
+      // spdlog::info("TGID {} already holds the GPU {} lock", tgid, gpu_id);
      out_self_hold = true;
      retry_cnt = 0;
+      this->is_holding_lock = true;
      return true;
    }
  }
  // failed
-  if (++retry_cnt % 2000 == 0) {
+  if (++retry_cnt % 10000 == 0) {
    spdlog::info(
        "TGID {} trying to acquire GPU {} lock, current lock holder TGID {}",
        tgid, gpu_id, unpack_tgid_field(old_flag));
@@ -116,19 +204,23 @@ bool ShmWorker::lock_gpu(bool &out_self_hold) {
  }
 }

-void ShmWorker::unlock_gpu() {
+// Releases this worker's hold on the lock flag of GPU `gpu_id`.
+//
+// keep_wait: when false, the worker's waiting flag is cleared first via
+//            cancel_wait(); when true, cancel_wait() is not called.
+//
+// If the TGID field of the current flag is not this worker's `tgid`, the flag
+// is left unmodified (the mismatch is logged only when keep_wait is false).
+// Otherwise the flag is overwritten with pack_unlocked_tgid(tgid).
+// `is_holding_lock` is reset to false in every case.
+void ShmWorker::unlock_gpu(bool keep_wait) {
+  if (!keep_wait) {
+    cancel_wait();
+  }
+
  uint64_t old_flag =
      shm_helper->gpu_flag[gpu_id].load(std::memory_order_acquire);
  if (unpack_tgid_field(old_flag) != tgid) {
-    // spdlog::warn("previous gpu flag {} does not match expected locked flag for "
-    //              "TGID {}. This may be a bug, unless during startup.",
-    //              old_flag, tgid);
-    spdlog::info("TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    if (!keep_wait) {
+      spdlog::info("unlock: TGID {} does not hold GPU {} lock", tgid, gpu_id);
+    }
  } else {
    uint64_t new_flag = pack_unlocked_tgid(tgid);
    shm_helper->gpu_flag[gpu_id].store(new_flag, std::memory_order_release);
-    spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
+    // spdlog::info("TGID {} released GPU {} lock", tgid, gpu_id);
  }
+  this->is_holding_lock = false;
 }

 uint64_t ShmWorker::make_request(uint32_t type, uint64_t parameter) {