diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index dc3edaa..ccbd3ae 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -258,7 +258,7 @@ class TestNPUWorker(TestBase): # Create worker mock with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None): worker = NPUWorker() - + worker._sleep_saved_buffers = {} # Test wake_up method worker.wake_up(tags=["test_tag"]) diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py index 0a03e4a..6298d34 100644 --- a/vllm_ascend/worker/worker_v1.py +++ b/vllm_ascend/worker/worker_v1.py @@ -111,6 +111,9 @@ class NPUWorker(WorkerBase): init_cached_hf_modules() self.profiler = self._init_profiler() + if sleep_mode_enabled(): + # Buffers saved before sleep + self._sleep_saved_buffers: dict[str, torch.Tensor] = {} def sleep(self, level: int = 1) -> None: if not sleep_mode_enabled(): @@ -118,6 +121,13 @@ class NPUWorker(WorkerBase): "Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1." ) free_bytes_before_sleep = NPUPlatform.mem_get_info()[0] + # Save the buffers before level 2 sleep + if level == 2: + model = self.model_runner.model + self._sleep_saved_buffers = { + name: buffer.cpu().clone() + for name, buffer in model.named_buffers() + } allocator = CaMemAllocator.get_instance() allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) free_bytes_after_sleep, total = NPUPlatform.mem_get_info() @@ -137,6 +147,14 @@ class NPUWorker(WorkerBase): allocator = CaMemAllocator.get_instance() allocator.wake_up(tags=tags) + # Restore the buffers after level 2 sleep + if len(self._sleep_saved_buffers): + model = self.model_runner.model + for name, buffer in model.named_buffers(): + if name in self._sleep_saved_buffers: + buffer.data.copy_(self._sleep_saved_buffers[name].data) + self._sleep_saved_buffers = {} + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks