[Worker][V1] Support sleep mode for v1 (#1084)

### What this PR does / why we need it?
Support sleep mode for v1.

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-06 21:54:02 +08:00
parent 0395ab30be
commit a2552e10e4
5 changed files with 65 additions and 60 deletions
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1235,11 +1235,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
        # assert self.lora_manager is not None, "LoRA is not enabled"
        # TODO: call maybe_profile_with_lora()

-        dummy_kv_caches = [
-            torch.tensor((), dtype=torch.float32, device=self.device)
-            for _ in range(self.num_attn_layers)
-        ]
-
        # Trigger compilation for general shape.
        hidden_states = self._dummy_run(self.max_num_tokens)

@@ -1250,7 +1245,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
            logits = None

        NPUPlatform.synchronize()
-        del hidden_states, logits, dummy_kv_caches
+        del hidden_states, logits
        self.encoder_cache.clear()
        gc.collect()