diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7d9710f1..7b546ec1 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1577,8 +1577,11 @@ class NPUModelRunner(GPUModelRunner):
         # tokens on the CPU, so they are run after bookkeeping.
         propose_draft_token_ids(valid_sampled_token_ids)
 
-        if has_kv_transfer_group():
-            get_kv_transfer_group().clear_connector_metadata()
+        # vLLM v0.18 defers KV connector finalization during target-model
+        # forward when speculative decoding is enabled. Finalize here after
+        # draft model runs so KV pool save/put can complete.
+        if self.speculative_config is not None:
+            self.finalize_kv_connector()
 
         if self.model_config.enable_return_routed_experts:
             capturer = RoutedExpertsCapturer.get_instance()