adapt to vllm-ascend v0.18.0rc1
@@ -265,7 +265,10 @@ class NPUWorker(WorkerBase):
         # take current memory snapshot
         self.init_snapshot = MemorySnapshot()
         self.requested_memory = self.init_snapshot.total_memory * self.cache_config.gpu_memory_utilization
-        if self.init_snapshot.free_memory < self.requested_memory:
+        if (
+            self.init_snapshot.free_memory < self.requested_memory
+            and not envs_ascend.VLLM_ASCEND_ENABLE_VNPU
+        ):
             GiB = lambda b: round(b / GiB_bytes, 2)
             raise ValueError(
                 f"Free memory on device "
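Note: the precheck arithmetic in this hunk can be reproduced standalone. A minimal sketch follows, with hypothetical device numbers and an illustrative error message (the f-string in the hunk is truncated here); it is not part of the commit.

GiB_bytes = 1 << 30

def check_free_memory(free: int, total: int, util: float, vnpu_enabled: bool) -> None:
    # Same logic as the hunk: request util * total bytes up front and fail
    # fast if the device cannot satisfy it, unless vNPU mode is enabled.
    requested = total * util
    if free < requested and not vnpu_enabled:
        GiB = lambda b: round(b / GiB_bytes, 2)
        raise ValueError(f"Free memory on device ({GiB(free)} GiB) is less "
                         f"than requested ({GiB(requested)} GiB).")

# 20 GiB free of 64 GiB at utilization 0.9 would raise; vNPU mode skips the check.
check_free_memory(20 * GiB_bytes, 64 * GiB_bytes, 0.9, vnpu_enabled=True)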
@@ -360,6 +363,28 @@ class NPUWorker(WorkerBase):
 
         return int(self.available_kv_cache_memory_bytes)
 
+    @torch.inference_mode()
+    def determine_available_memory_vnpu_offload_mode(self) -> int:
+        GiB = lambda b: b / GiB_bytes
+        allocator = CaMemAllocator.get_instance()
+        free, total = allocator.get_pool_mem_info()
+        if self.cache_config.gpu_memory_utilization <= 0.9:
+            logger.warning(
+                "GPU memory utilization is set to %.2f. For VNPU mode, it is recommended to set gpu_memory_utilization to a larger value",
+                self.cache_config.gpu_memory_utilization,
+            )
+        available_kv_cache_memory = int(
+            total * self.cache_config.gpu_memory_utilization - (total - free)
+        )
+        available_kv_cache_memory = int(max(available_kv_cache_memory, 0))
+        self.available_kv_cache_memory_bytes = available_kv_cache_memory
+        logger.info_once(
+            "Available KV cache memory: %.2f GiB",
+            GiB(self.available_kv_cache_memory_bytes),
+            scope="local",
+        )
+        return int(self.available_kv_cache_memory_bytes)
+
     def execute_model(
         self,
         scheduler_output: "SchedulerOutput",
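Note: the budget computed by the new method reduces to free - total * (1 - utilization): reserve a fixed headroom fraction of the pool, then grant whatever of the remainder is actually free, clamped at zero. A standalone sketch with hypothetical pool sizes (not part of the commit):

GiB_bytes = 1 << 30

def kv_cache_budget(free: int, total: int, gpu_memory_utilization: float) -> int:
    # Same formula as determine_available_memory_vnpu_offload_mode:
    # usable share of the pool minus what is already allocated, never negative.
    return max(int(total * gpu_memory_utilization - (total - free)), 0)

# 60 GiB pool, 22 GiB free, utilization 0.95 -> 57 - 38 = 19.0 GiB for KV cache.
print(kv_cache_budget(22 * GiB_bytes, 60 * GiB_bytes, 0.95) / GiB_bytes)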
@@ -431,6 +456,12 @@ class NPUWorker(WorkerBase):
             allocator = CaMemAllocator.get_instance()
             assert allocator.get_current_usage() == 0, "Sleep mode can only be used for one instance per process."
             context = allocator.use_memory_pool(tag="weights")
+        elif envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            allocator = CaMemAllocator.get_instance()
+            assert (
+                allocator.get_current_usage() == 0
+            ), "vNPU mode can only be used for one instance per process."
+            context = allocator.use_memory_pool(tag="weights")
         else:
             from contextlib import nullcontext
 
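Note: this hunk, and the kv_cache hunk further down, follow the same pattern: route allocations into a tagged CaMemAllocator pool when sleep mode or vNPU mode is active, otherwise fall back to a no-op context. A minimal sketch of that pattern with a stand-in pool (hypothetical, not the allocator's real API):

from contextlib import contextmanager, nullcontext

@contextmanager
def tagged_pool(tag: str):
    # Stand-in for CaMemAllocator.use_memory_pool(tag=...): just logs.
    print(f"entering pool {tag!r}")
    try:
        yield
    finally:
        print(f"leaving pool {tag!r}")

def alloc_context(tag: str, sleep_mode: bool, vnpu_enabled: bool):
    # Mirrors the branch structure above: both modes share the tagged
    # pool ("weights" here, "kv_cache" later); the default path is a no-op.
    if sleep_mode or vnpu_enabled:
        return tagged_pool(tag)
    return nullcontext()

with alloc_context("weights", sleep_mode=False, vnpu_enabled=True):
    pass  # model weights would be allocated here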
@@ -438,6 +469,23 @@ class NPUWorker(WorkerBase):
 
         with context, set_current_vllm_config(self.vllm_config):
             self.model_runner.load_model()
+        if envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            # save memory to host with lock
+            self.offload_vram()
+            succ, _ = self.try_reload_vram()
+            assert succ, "Failed to reload model weights after offloading."
+
+    def offload_vram(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.offload_vram(offload_tags=("weights",))
+
+    def try_reload_vram(self) -> tuple[bool, bool]:
+        allocator = CaMemAllocator.get_instance()
+        return allocator.try_reload_vram(tags=None)
+
+    def vnpu_unlock_gpu(self) -> None:
+        allocator = CaMemAllocator.get_instance()
+        allocator.vnpu_unlock_gpu()
 
     def compile_or_warm_up_model(self) -> float:
         # Note: need to adapt for graph mode.
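Note: after loading, the worker immediately exercises the offload/reload round trip so a failure surfaces at startup rather than mid-serving. A hypothetical driver for the same handshake, with a stub in place of the worker (sketch only, not part of the commit):

class _StubWorker:
    # Mimics the signatures of the two new methods for illustration.
    def offload_vram(self) -> None:
        print("weights pool offloaded to host")

    def try_reload_vram(self) -> tuple[bool, bool]:
        return True, False

def cycle_weights(worker) -> None:
    worker.offload_vram()               # move the "weights" pool to host
    succ, _ = worker.try_reload_vram()  # attempt to bring it back onto the device
    assert succ, "Failed to reload model weights after offloading."

cycle_weights(_StubWorker())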
@@ -517,6 +565,9 @@ class NPUWorker(WorkerBase):
         if self.vllm_config.model_config.enable_sleep_mode:
             allocator = CaMemAllocator.get_instance()
             context = allocator.use_memory_pool(tag="kv_cache")
+        elif envs_ascend.VLLM_ASCEND_ENABLE_VNPU:
+            allocator = CaMemAllocator.get_instance()
+            context = allocator.use_memory_pool(tag="kv_cache")
         else:
             from contextlib import nullcontext
 