[Bugfix] Fix sleep mode level 2 (#1376)
### What this PR does / why we need it?
For sleep mode level 2, we discard both the model weights and the
kv_cache. The problem is: when we discard the weights, we also discard
the tensors representing model state that are exposed via
`model.named_buffers()` — for example `running_mean` / `running_var` in
BatchNorm, the RoPE cos-sin cache, etc. If we later reload the weights
but forget to restore these buffers as well, it can lead to subtle,
hard-to-diagnose issues.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.10.2
- vLLM main:
5963b98b46
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -258,7 +258,7 @@ class TestNPUWorker(TestBase):
|
|||||||
# Create worker mock
|
# Create worker mock
|
||||||
with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
|
with patch.object(NPUWorker, "__init__", lambda x, **kwargs: None):
|
||||||
worker = NPUWorker()
|
worker = NPUWorker()
|
||||||
|
worker._sleep_saved_buffers = {}
|
||||||
# Test wake_up method
|
# Test wake_up method
|
||||||
worker.wake_up(tags=["test_tag"])
|
worker.wake_up(tags=["test_tag"])
|
||||||
|
|
||||||
|
|||||||
@@ -111,6 +111,9 @@ class NPUWorker(WorkerBase):
|
|||||||
init_cached_hf_modules()
|
init_cached_hf_modules()
|
||||||
|
|
||||||
self.profiler = self._init_profiler()
|
self.profiler = self._init_profiler()
|
||||||
|
if sleep_mode_enabled():
|
||||||
|
# Buffers saved before sleep
|
||||||
|
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||||
|
|
||||||
def sleep(self, level: int = 1) -> None:
|
def sleep(self, level: int = 1) -> None:
|
||||||
if not sleep_mode_enabled():
|
if not sleep_mode_enabled():
|
||||||
@@ -118,6 +121,13 @@ class NPUWorker(WorkerBase):
|
|||||||
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
"Sleep mode is not enabled. Please compile vllm-ascend with COMPILE_CUSTOM_KERNELS=1."
|
||||||
)
|
)
|
||||||
free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
|
free_bytes_before_sleep = NPUPlatform.mem_get_info()[0]
|
||||||
|
# Save the buffers before level 2 sleep
|
||||||
|
if level == 2:
|
||||||
|
model = self.model_runner.model
|
||||||
|
self._sleep_saved_buffers = {
|
||||||
|
name: buffer.cpu().clone()
|
||||||
|
for name, buffer in model.named_buffers()
|
||||||
|
}
|
||||||
allocator = CaMemAllocator.get_instance()
|
allocator = CaMemAllocator.get_instance()
|
||||||
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
||||||
free_bytes_after_sleep, total = NPUPlatform.mem_get_info()
|
free_bytes_after_sleep, total = NPUPlatform.mem_get_info()
|
||||||
@@ -137,6 +147,14 @@ class NPUWorker(WorkerBase):
|
|||||||
allocator = CaMemAllocator.get_instance()
|
allocator = CaMemAllocator.get_instance()
|
||||||
allocator.wake_up(tags=tags)
|
allocator.wake_up(tags=tags)
|
||||||
|
|
||||||
|
# Restore the buffers after level 2 sleep
|
||||||
|
if len(self._sleep_saved_buffers):
|
||||||
|
model = self.model_runner.model
|
||||||
|
for name, buffer in model.named_buffers():
|
||||||
|
if name in self._sleep_saved_buffers:
|
||||||
|
buffer.data.copy_(self._sleep_saved_buffers[name].data)
|
||||||
|
self._sleep_saved_buffers = {}
|
||||||
|
|
||||||
def initialize_cache(self, num_gpu_blocks: int,
|
def initialize_cache(self, num_gpu_blocks: int,
|
||||||
num_cpu_blocks: int) -> None:
|
num_cpu_blocks: int) -> None:
|
||||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||||
|
|||||||
Reference in New Issue
Block a user