vllm-ascend vnpu v1

2025-12-26 07:37:35 +00:00
parent 2f1aed98cc
commit 135cc0a505
168 changed files with 28337 additions and 9 deletions
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -258,6 +258,17 @@ class NPUWorker(WorkerBase):
        )
        return available_kv_cache_memory

+    def determine_available_memory_idle_offload_mode(self) -> int:
+        """Return the KV-cache byte budget derived from the CaMemAllocator pool, never negative."""
+        free, total = CaMemAllocator.get_instance().get_pool_mem_info()
+        # Budget = utilization share of the pool minus what the pool already uses.
+        budget = int(total * self.cache_config.gpu_memory_utilization - (total - free))
+        available_kv_cache_memory = budget if budget > 0 else 0
+        logger.info(
+            f"Available memory (idle offload mode): {available_kv_cache_memory}, total memory: {total}"
+        )
+        return available_kv_cache_memory
+
    def execute_model(
        self,
        scheduler_output: "SchedulerOutput",
@@ -306,11 +317,46 @@ class NPUWorker(WorkerBase):
                "Sleep mode can only be "
                "used for one instance per process.")
            context = allocator.use_memory_pool(tag="weights")
+        elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
+            if not sleep_mode_enabled():
+                raise ValueError(
+                    "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
+                )
+            if is_enable_nz():
+                raise ValueError(
+                    "FRACTAL_NZ mode is enabled. This may cause model parameter precision issues "
+                    "in the RL scenarios. Please set VLLM_ASCEND_ENABLE_NZ=0.")
+            allocator = CaMemAllocator.get_instance()
+            assert allocator.get_current_usage() == 0, (
+                "Idle offload mode can only be "
+                "used for one instance per process.")
+            context = allocator.use_memory_pool(tag="weights")
        else:
            from contextlib import nullcontext
            context = nullcontext()  # type: ignore
        with context:
            self.model_runner.load_model()
+            if envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
+                # save memory to host with lock
+                self.offload_vram()
+                self.reload_vram()
+
+    def offload_vram(self) -> None:
+        """Offload the allocations tagged "weights" through CaMemAllocator.
+
+        NOTE(review): freed-memory accounting is disabled here. A disabled
+        draft sampled NPUPlatform.mem_get_info() before and after the offload,
+        asserted that usage had not grown, and logged the freed and still-used
+        GiB; none of that runs today, so this method reports nothing.
+        """
+        # Only the "weights" tag is passed, so other pool tags are not requested.
+        pool = CaMemAllocator.get_instance()
+        weight_tags = ("weights", )
+        pool.offload_vram(offload_tags=weight_tags)
+
+    def reload_vram(self) -> bool:
+        """Call CaMemAllocator.reload_vram with tags=None and return whatever it reports."""
+        return CaMemAllocator.get_instance().reload_vram(tags=None)

    def compile_or_warm_up_model(self) -> None:
        # Note: need to adapt for graph mode.
@@ -351,6 +397,9 @@ class NPUWorker(WorkerBase):
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CaMemAllocator.get_instance()
            context = allocator.use_memory_pool(tag="kv_cache")
+        elif envs_ascend.VLLM_ASCEND_ENABLE_IDLE_OFFLOAD:
+            allocator = CaMemAllocator.get_instance()
+            context = allocator.use_memory_pool(tag="kv_cache")
        else:
            from contextlib import nullcontext
            context = nullcontext()  # type: ignore