From 01592515b8265db7f2da22e0983e1f5ef6eead4c Mon Sep 17 00:00:00 2001 From: Li Wang Date: Thu, 18 Sep 2025 19:51:52 +0800 Subject: [PATCH] [Bugfix] Fix sleep mode level 2 (#1376) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What this PR does / why we need it? For sleep mode level 2, we discard both the model weights and the kv_cache, but the problem is: when we discard the weights, we also discard some tensors representing the model state, namely those returned by `model.named_buffers()`, such as `running_mean / running_var` in BatchNorm, the rope cos-sin cache, etc. If we update the weights but forget to update the buffers as well, this can lead to unexpected issues. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.10.2 - vLLM main: https://github.com/vllm-project/vllm/commit/5963b98b465007e3cfb0d39447e4459a8afa96dc --------- Signed-off-by: wangli --- tests/ut/worker/test_worker_v1.py | 2 +- vllm_ascend/worker/worker_v1.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index dc3edaa..ccbd3ae 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -258,7 +258,7 @@ class TestNPUWorker(TestBase): # Create worker mock with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): worker = NPUWorker() - + worker._sleep_saved_buffers = {} # Test wake_up method worker.wake_up(tags=["test_tag"]) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 0a03e4a..6298d34 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -111,6 +111,9 @@ class NPUWorker(WorkerBase): init_cached_hf_modules() self.profiler = self._init_profiler() + if sleep_mode_enabled(): + # Buffers saved before sleep + self._sleep_saved_buffers: dict[str, torch.Tensor] = {} def sleep(self, level: int = 1) -> None: if not
sleep_mode_enabled(): @@ -118,6 +121,13 @@ class NPUWorker(WorkerBase): "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." ) free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] + # Save the buffers before level 2 sleep + if level == 2: + model = self.model_runner.model + self._sleep_saved_buffers = { + name: buffer.cpu().clone() + for name, buffer in model.named_buffers() + } allocator = CaMemAllocator.get_instance() allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) free_bytes_after_sleep, total = NPUPlatform.mem_get_info() @@ -137,6 +147,14 @@ class NPUWorker(WorkerBase): allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + # Restore the buffers after level 2 sleep + if len(self._sleep_saved_buffers): + model = self.model_runner.model + for name, buffer in model.named_buffers(): + if name in self._sleep_saved_buffers: + buffer.data.copy_(self._sleep_saved_buffers[name].data) + self._sleep_saved_buffers = {} + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks