[Bugfix] Route requests requiring KVC recomputation from the decode instance to the P instance (#3448)

### What this PR does / why we need it? This PR aims to fix the out-of-memory bug caused by recomputation in the decode instance. When recomputation happens in the decode instance, KV cache usage may exceed the pre-allocated memory, which causes an OOM. So we propose a new scheduling strategy: when the decode instance cannot allocate a new block for running requests, we stop the request that would otherwise be preempted. These stopped requests will be recognized by the proxy and sent to the prefill instance again to recompute the KV cache, and then directed back to the decode instance. This is a temporary plan to fix the bug. The long-term strategy is to use CPU offload in the decode instance. ### Does this PR introduce _any_ user-facing change? An extra ascend configuration option **-- recompute_scheduler_enable = True** is added to enable this strategy. The default value is False. ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0 --------- Signed-off-by: CHEN <116010019@link.cuhk.edu.cn>
2025-10-18 15:56:44 +08:00
parent 4750d45d86
commit b4233a2ec3
6 changed files with 1761 additions and 114 deletions
--- a/vllm_ascend/ascend_config.py
+++ b/vllm_ascend/ascend_config.py
@@ -70,6 +70,8 @@ class AscendConfig:
        ) and not self.torchair_graph_config.enabled and vllm_config.parallel_config.enable_expert_parallel
        self.multistream_overlap_shared_expert = additional_config.get(
            "multistream_overlap_shared_expert", False)
+        self.recompute_scheduler_enable = additional_config.get(
+            "recompute_scheduler_enable", False)
        self.lmhead_tensor_parallel_size = additional_config.get(
            "lmhead_tensor_parallel_size", None)
        if self.lmhead_tensor_parallel_size is not None:
--- a/vllm_ascend/core/recompute_schedule_config.py
+++ b/vllm_ascend/core/recompute_schedule_config.py
@@ -0,0 +1,39 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+
+from dataclasses import dataclass, fields
+from typing import Type, Union
+
+from vllm.config import SchedulerConfig
+
+# 2147483647 == 2**31 - 1, the largest value of a signed 32-bit integer.
+MAX_INT = 2147483647
+
+
+@dataclass
+class RecomputeSchedulerConfig(SchedulerConfig):
+    """Scheduler configuration that selects the recompute scheduler.
+
+    Extends ``SchedulerConfig`` and changes the default of ``scheduler_cls``
+    to the dotted path of ``RecomputeScheduler``.
+    """
+
+    # Dotted path (or class object) of the scheduler this config selects.
+    scheduler_cls: Union[str, Type[object]] = (
+        "vllm_ascend.core.recompute_scheduler.RecomputeScheduler")
+
+    @classmethod
+    def initialize_from_config(cls, vllm_scheduler_config: SchedulerConfig):
+        """Build an instance of ``cls`` from an existing scheduler config.
+
+        Args:
+            vllm_scheduler_config: Config whose init-able dataclass fields
+                are copied into the new instance.
+
+        Returns:
+            A new ``cls`` instance carrying the copied field values, with
+            ``scheduler_cls`` set to the recompute scheduler path.
+        """
+        init_kwargs = {}
+        for spec in fields(vllm_scheduler_config):
+            # Fields declared with init=False are not constructor arguments.
+            if not spec.init:
+                continue
+            init_kwargs[spec.name] = getattr(vllm_scheduler_config, spec.name)
+        # Replace whichever scheduler the source config named.
+        init_kwargs.update(
+            scheduler_cls=(
+                "vllm_ascend.core.recompute_scheduler.RecomputeScheduler"))
+        return cls(**init_kwargs)
--- a/vllm_ascend/core/recompute_scheduler.py
+++ b/vllm_ascend/core/recompute_scheduler.py
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -300,6 +300,12 @@ class NPUPlatform(Platform):
                vllm_config.scheduler_config,
                ascend_config.ascend_scheduler_config)
            vllm_config.scheduler_config = ascend_scheduler_config
+        elif ascend_config.recompute_scheduler_enable:
+            from vllm_ascend.core.recompute_schedule_config import \
+                RecomputeSchedulerConfig
+            recompute_scheduler_config = RecomputeSchedulerConfig.initialize_from_config(
+                vllm_config.scheduler_config)
+            vllm_config.scheduler_config = recompute_scheduler_config

    @classmethod
    def get_attn_backend_cls(