From 2bb7e55022c3a558145a1b17ba3c93b4ab6bf00f Mon Sep 17 00:00:00 2001 From: CaveNightingale <51874645+CaveNightingale@users.noreply.github.com> Date: Fri, 15 Aug 2025 16:59:52 +0800 Subject: [PATCH] [Bugfix][PD]fix non-working disaggregated prefill (#2374) ### What this PR does / why we need it? Mainline vLLM fixed its disaggregated prefill in https://github.com/vllm-project/vllm/pull/22598 . However, disaggregated prefill still does not work in vllm-ascend: on Ascend devices, decoder instances crash before vLLM's fix and hang after it. This patch makes disaggregated prefill work by building a KVConnectorOutput when either finished_sending or finished_recving is set (previously both were required), and by returning these values through output.kv_connector_output on the empty-output path instead of setting them directly on the output. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Qwen3-0.6B 1P1D tp=1 dp=1 - vLLM version: v0.10.0 - vLLM main: https://github.com/vllm-project/vllm/commit/0fe85087a950f3ca94d60293c865c0e6c05e6eff --------- Signed-off-by: CaveNightingale --- vllm_ascend/worker/model_runner_v1.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 14f89bb..ebf76eb 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1636,7 +1636,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): finished_recving) = (self._process_reqs(scheduler_output, intermediate_tensors)) kv_connector_output = None - if finished_sending is not None and finished_recving is not None: + if finished_sending is not None or finished_recving is not None: kv_connector_output = KVConnectorOutput( finished_sending=finished_sending, finished_recving=finished_recving) @@ -1838,8 +1838,9 @@ class NPUModelRunner(LoRAModelRunnerMixin): return EMPTY_MODEL_RUNNER_OUTPUT output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.finished_sending = finished_sending - output.finished_recving = finished_recving + output.kv_connector_output = KVConnectorOutput( + finished_sending=finished_sending, + finished_recving=finished_recving) return output @staticmethod