From 1225c613fb95153ea838b3d43de7d5e755bd65aa Mon Sep 17 00:00:00 2001 From: pz1116 <47019764+Pz1116@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:57:09 +0800 Subject: [PATCH] [BugFix][0.18.0][KV Pool] Fix KV Pool not putting kv cache for vllm v0.18.0 (#7874) ### What this PR does / why we need it? vLLM v0.18 defers KV connector finalization during target-model forward when speculative decoding is enabled, leading to KV Pool not doing the Put operation. This change was forgotten when we bumped up the version for vllm-ascend. Fix by adding finalize_kv_connector for spec decode. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Signed-off-by: Pz1116 Co-authored-by: DreamerLeader <2270923832@qq.com> Co-authored-by: fems14 <1804143737@qq.com> --- vllm_ascend/worker/model_runner_v1.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 7d9710f1..7b546ec1 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1577,8 +1577,11 @@ class NPUModelRunner(GPUModelRunner): # tokens on the CPU, so they are run after bookkeeping. propose_draft_token_ids(valid_sampled_token_ids) - if has_kv_transfer_group(): - get_kv_transfer_group().clear_connector_metadata() + # vLLM v0.18 defers KV connector finalization during target-model + # forward when speculative decoding is enabled. Finalize here after + # draft model runs so KV pool save/put can complete. + if self.speculative_config is not None: + self.finalize_kv_connector() if self.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance()