Partially support multiple NPUs

2026-01-08 06:54:33 +00:00
parent fa0fb46853
commit 2a571d8bc8
12 changed files with 331 additions and 160 deletions
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -339,12 +339,13 @@ class NPUWorker(WorkerBase):
            if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
                # save memory to host with lock
                self.offload_vram()
-                self.reload_vram()
+                succ, _ = self.try_reload_vram()
+                assert succ, "Failed to reload model weights after offloading."

    def offload_vram(self) -> None:
        # free_bytes_before_offload = NPUPlatform.mem_get_info()[0]
        allocator = CaMemAllocator.get_instance()
-        allocator.offload_vram(offload_tags=("weights", ))
+        allocator.offload_vram(offload_tags=("weights",))
        # free_bytes_after_offload, total = NPUPlatform.mem_get_info()
        # freed_bytes = free_bytes_after_offload - free_bytes_before_offload
        # used_bytes = total - free_bytes_after_offload
@@ -354,9 +355,13 @@ class NPUWorker(WorkerBase):
        #     "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes,
        #     used_bytes / GiB_bytes)

-    def reload_vram(self) -> bool:
+    def try_reload_vram(self) -> tuple[bool, bool]:
        allocator = CaMemAllocator.get_instance()
-        return allocator.reload_vram(tags=None)
+        return allocator.try_reload_vram(tags=None)
+    
+    def vnpu_unlock_gpu(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_unlock_gpu()

    def compile_or_warm_up_model(self) -> None:
        # Note: need to adapt for graph mode.