Add feature: priority-based yielding to higher-priority vNPU waiters

Signed-off-by: Jing Wang <jingwang96@qq.com>
2026-05-12 11:51:57 +00:00
parent d627a45881
commit b6549b6e38
11 changed files with 382 additions and 66 deletions
--- a/vllm_ascend/device_allocator/camem.py
+++ b/vllm_ascend/device_allocator/camem.py
@@ -62,7 +62,10 @@ try:
            python_create_and_map_offload as python_create_and_map,python_unmap_and_release_offload as python_unmap_and_release,
            python_get_mem_info_offload as python_get_mem_info,
            python_try_lock_gpu_offload as python_try_lock_gpu,
-            python_unlock_gpu_offload as python_unlock_gpu
+            python_unlock_gpu_offload as python_unlock_gpu,
+            python_start_wait_offload as python_start_wait,
+            python_cancel_wait_offload as python_cancel_wait,
+            python_has_higher_priority_waiter_offload as python_has_higher_priority_waiter
        )
    else:
        from vllm_ascend.vllm_ascend_C import (  # type: ignore # noqa: F401
@@ -73,6 +76,9 @@ try:
        python_get_mem_info = None
        python_try_lock_gpu = None
        python_unlock_gpu = None
+        python_start_wait = None
+        python_cancel_wait = None
+        python_has_higher_priority_waiter = None

    lib_name = find_loaded_library("vllm_ascend_C")
    camem_available = True
@@ -84,6 +90,9 @@ except ImportError as e:
    python_get_mem_info = None
    python_try_lock_gpu = None
    python_unlock_gpu = None
+    python_start_wait = None
+    python_cancel_wait = None
+    python_has_higher_priority_waiter = None
    lib_name = None
    libcudart = None

@@ -306,15 +315,37 @@ class CaMemAllocator:
            return False, False

    def _vnpu_lock_gpu(self) -> bool:
+        is_waiting = False  # set once vnpu_start_wait() has been issued for this acquisition
        while True:
            success, _ = self.vnpu_try_lock_gpu()
            if success:
+                if is_waiting:
+                    self.vnpu_cancel_wait()  # lock obtained after waiting: cancel the wait started below
                return True
+            else:
+                if not is_waiting:
+                    self.vnpu_start_wait()  # issued at most once, on the first failed attempt
+                    is_waiting = True
+                self.vnpu_unlock_gpu(keep_wait=True)  # NOTE(review): unlock after a failed try-lock; keep_wait presumably preserves the wait — confirm
+
            time.sleep(0.001)

-    def vnpu_unlock_gpu(self):
+    def vnpu_unlock_gpu(self, keep_wait: bool = False) -> None:
        if python_unlock_gpu:
-            python_unlock_gpu()
+            python_unlock_gpu(keep_wait)  # skipped when the binding was set to None at import time
+
+    def vnpu_start_wait(self) -> None:
+        if python_start_wait:  # None when the offload bindings were not imported
+            python_start_wait()
+
+    def vnpu_cancel_wait(self) -> None:
+        if python_cancel_wait:  # None when the offload bindings were not imported
+            python_cancel_wait()
+
+    def vnpu_has_higher_priority_waiter(self) -> bool:
+        if python_has_higher_priority_waiter:  # None when the offload bindings were not imported
+            return python_has_higher_priority_waiter()
+        return False  # binding unavailable: report that nobody is waiting

    def get_pool_mem_info(self) -> tuple[int, int]:
        """
--- a/vllm_ascend/patch/platform/patch_core.py
+++ b/vllm_ascend/patch/platform/patch_core.py
@@ -1,6 +1,8 @@
+from concurrent.futures import Future
 from logging import DEBUG
-import signal
 import queue
+import signal
+import time

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.logger import logger
@@ -98,8 +100,13 @@ def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
        if engine_core is not None:
            engine_core.shutdown()

+
 def run_busy_loop(self):
    """Core busy loop of the EngineCore."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False
+
    while self._handle_shutdown():
        # 1) Poll the input queue until there is work to do.
        self._process_input_queue()
@@ -111,14 +118,52 @@ def run_busy_loop(self):
            prev_is_self = self.model_executor.reload_vram()
            if not prev_is_self:
                self.reset_prefix_cache()
+
        # 2) Step the engine core and return the outputs.
        self._process_engine_step()
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+            elif not prepared_yield:
+                # Probe for a higher-priority waiter on every 10th step that has work.
+                yield_probe_counter = (yield_probe_counter + 1) % 10
+                if yield_probe_counter == 0:
+                    should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                    if should_yield:
+                        logger.info(
+                            "Found other higher priority worker. Current engine will yield after finishing in-flight requests."
+                        )
+                        prepared_yield = True
+                        pause_future = self.pause_scheduler(
+                            mode="wait", clear_cache=True
+                        )
+
+                        def pause_complete(f: Future):  # f is None when the pause finished synchronously
+                            nonlocal prepared_yield
+                            try:
+                                if f is not None:
+                                    f.result()  # surfaces any exception stored on the pause future
+                                if not self.model_executor.is_offloaded():
+                                    self.model_executor.offload_vram(is_yield=True)
+                                prepared_yield = False
+                                logger.info("Current engine has yielded.")
+                                # The scheduler has to wake itself up after yielding;
+                                # sleep first to give the other worker a chance.
+                                time.sleep(2)
+                                self.resume_scheduler()
+                            except Exception as e:
+                                logger.exception("Failed to yield: %s.", e)
+                                raise
+
+                        if pause_future is None:
+                            # pause finished, no in-flight requests
+                            pause_complete(None)
+                        else:
+                            # pause_future will be set after all in-flight
+                            # requests are finished in _process_input_queue
+                            pause_future.add_done_callback(pause_complete)

    raise SystemExit

@@ -142,7 +187,8 @@ def _process_input_queue(self):
                and not self.model_executor.is_offloaded()
            ):
                self.model_executor.offload_vram()
-        block = self.process_input_queue_block
+        # vNPU: do not block on the input queue while the engine still has work (e.g. right after the scheduler resumes).
+        block = self.process_input_queue_block and not self.has_work()
        try:
            req = self.input_queue.get(block=block)
            self._handle_client_request(*req)
@@ -169,6 +215,9 @@ EngineCoreProc._process_input_queue = _process_input_queue

 def DPEngineCoreProc_run_busy_loop(self):
    """Core busy loop of the EngineCore for data parallel case."""
+    # vnpu yield
+    yield_probe_counter = 0
+    prepared_yield = False

    # Loop until process is sent a SIGINT or SIGTERM
    while self._handle_shutdown():
@@ -226,13 +275,49 @@ def DPEngineCoreProc_run_busy_loop(self):
            # Increment wave count and reset step counter.
            self.current_wave += 1
            self.step_counter = 0
-    
-        if (
-            envs_ascend.VLLM_ASCEND_ENABLE_VNPU
-            and not self.has_work()
-            and not self.model_executor.is_offloaded()
-        ):
-            self.model_executor.offload_vram()
+
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            if not self.has_work():
+                if not self.model_executor.is_offloaded():
+                    self.model_executor.offload_vram()
+            elif not prepared_yield:
+                # Probe for a higher-priority waiter on every 10th step that has work.
+                yield_probe_counter = (yield_probe_counter + 1) % 10
+                if yield_probe_counter == 0:
+                    should_yield = self.model_executor.vnpu_has_higher_priority_waiter()
+                    if should_yield:
+                        logger.info(
+                            "Found other higher priority worker. Current engine will yield after finishing in-flight requests."
+                        )
+                        prepared_yield = True
+                        pause_future = self.pause_scheduler(
+                            mode="wait", clear_cache=True
+                        )
+
+                        def pause_complete(f: Future):  # f is None when the pause finished synchronously
+                            nonlocal prepared_yield
+                            try:
+                                if f is not None:
+                                    f.result()  # surfaces any exception stored on the pause future
+                                if not self.model_executor.is_offloaded():
+                                    self.model_executor.offload_vram(is_yield=True)
+                                prepared_yield = False
+                                logger.info("Current engine has yielded.")
+                                # The scheduler has to wake itself up after yielding;
+                                # sleep first to give the other worker a chance.
+                                time.sleep(2)
+                                self.resume_scheduler()
+                            except Exception as e:
+                                logger.exception("Failed to yield: %s.", e)
+                                raise
+
+                        if pause_future is None:
+                            # pause finished, no in-flight requests
+                            pause_complete(None)
+                        else:
+                            # pause_future will be set after all in-flight
+                            # requests are finished in _process_input_queue
+                            pause_future.add_done_callback(pause_complete)

    raise SystemExit

--- a/vllm_ascend/patch/platform/patch_executor.py
+++ b/vllm_ascend/patch/platform/patch_executor.py
@@ -8,7 +8,12 @@ def is_offloaded(self) -> bool:
        self._is_offloaded = False
    return self._is_offloaded

-def offload_vram(self):
+def is_yielded(self) -> bool:  # NOTE(review): not assigned onto Executor in the visible hunks — confirm it is wired up
+    if not hasattr(self, "_is_yielded"):  # attribute is created lazily on first query
+        self._is_yielded = False
+    return self._is_yielded
+
+def offload_vram(self, is_yield: bool = False):
    if self.is_offloaded():
        logger.warning("Executor is already offloaded.")
        return
@@ -17,14 +22,18 @@ def offload_vram(self):
    time_after_offload = time.perf_counter()

    self._is_offloaded = True
-    logger.info(f"Offloading VRAM costs {time_after_offload - time_before_offload:.6f} seconds.")
-
+    if is_yield:
+        self._is_yielded = True
+    logger.info(
+        f"Offloading VRAM costs {time_after_offload - time_before_offload:.3f} seconds."
+    )

 def reload_vram(self) -> bool:
    if not self.is_offloaded():
        logger.warning("Executor is not offloaded.")
        return True

+    is_waiting = False
    while True:
        time_before_reload = time.perf_counter()
        res = self.collective_rpc("try_reload_vram")
@@ -33,15 +42,28 @@ def reload_vram(self) -> bool:
        succ = all(x[0] for x in res)
        if succ:
            self._is_offloaded = False
-            logger.info(f"Reloading VRAM costs {time_after_reload - time_before_reload:.6f} seconds.")
+            self._is_yielded = False
            prev_is_self = all(x[1] for x in res)
+            if is_waiting:
+                self.collective_rpc("vnpu_cancel_wait")
+            logger.info(
+                f"Reloading VRAM costs {time_after_reload - time_before_reload:.3f} seconds."
+            )
            return prev_is_self
        else:
            # some workers not get lock
-            self.collective_rpc("vnpu_unlock_gpu")
+            if not is_waiting:
+                self.collective_rpc("vnpu_start_wait")
+                is_waiting = True
+            self.collective_rpc("vnpu_unlock_gpu", kwargs={"keep_wait": True})
            time.sleep(0.001)

+def vnpu_has_higher_priority_waiter(self) -> bool:
+    res = self.collective_rpc("vnpu_has_higher_priority_waiter")  # presumably one reply per worker — confirm
+    return any(res)  # True as soon as any reply is truthy
+

 Executor.is_offloaded = is_offloaded
 Executor.offload_vram = offload_vram
 Executor.reload_vram = reload_vram
+Executor.vnpu_has_higher_priority_waiter = vnpu_has_higher_priority_waiter
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -470,7 +470,7 @@ class NPUWorker(WorkerBase):
                # save memory to host with lock
                self.offload_vram()
                succ, _ = self.try_reload_vram()
-                assert succ, "Failed to reload model weights after offloading."
+                # assert succ, "Failed to reload model weights after offloading."  # NOTE(review): check disabled, so a failed reload is ignored here — confirm intended

    def offload_vram(self) -> None:
        allocator = CaMemAllocator.get_instance()
@@ -480,9 +480,21 @@ class NPUWorker(WorkerBase):
        allocator = CaMemAllocator.get_instance()
        return allocator.try_reload_vram(tags=None)

-    def vnpu_unlock_gpu(self) -> None:
+    def vnpu_unlock_gpu(self, keep_wait: bool = False) -> None:
        allocator = CaMemAllocator.get_instance()
-        allocator.vnpu_unlock_gpu()
+        allocator.vnpu_unlock_gpu(keep_wait)  # keep_wait is passed through to CaMemAllocator unchanged
+
+    def vnpu_start_wait(self) -> None:  # presumably reached by name via Executor.collective_rpc — confirm
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_start_wait()
+
+    def vnpu_cancel_wait(self) -> None:  # presumably reached by name via Executor.collective_rpc — confirm
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_cancel_wait()
+
+    def vnpu_has_higher_priority_waiter(self) -> bool:  # presumably reached by name via Executor.collective_rpc — confirm
+        allocator = CaMemAllocator.get_instance()
+        return allocator.vnpu_has_higher_priority_waiter()  # this worker's answer only

    def compile_or_warm_up_model(self) -> float:
        # Note: need to adapt for graph mode.