[Bugfix][PD] Fix non-working disaggregated prefill (#2374)
### What this PR does / why we need it?
Mainline vLLM fixes its disaggregated prefill in
https://github.com/vllm-project/vllm/pull/22598 . But it is still not
working in vllm-ascend.
To be concrete, decoder instances crash before vLLM's fix and hang after
vLLM's fix on Ascend devices.
This patch allows disaggregated prefill to work.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Tested with Qwen3-0.6B in a 1P1D (one prefill, one decode) setup with tp=1, dp=1.
- vLLM version: v0.10.0
- vLLM main:
0fe85087a9
---------
Signed-off-by: CaveNightingale <cavenightingale@foxmail.com>
This commit is contained in:
@@ -1636,7 +1636,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
finished_recving) = (self._process_reqs(scheduler_output,
|
finished_recving) = (self._process_reqs(scheduler_output,
|
||||||
intermediate_tensors))
|
intermediate_tensors))
|
||||||
kv_connector_output = None
|
kv_connector_output = None
|
||||||
if finished_sending is not None and finished_recving is not None:
|
if finished_sending is not None or finished_recving is not None:
|
||||||
kv_connector_output = KVConnectorOutput(
|
kv_connector_output = KVConnectorOutput(
|
||||||
finished_sending=finished_sending,
|
finished_sending=finished_sending,
|
||||||
finished_recving=finished_recving)
|
finished_recving=finished_recving)
|
||||||
@@ -1838,8 +1838,9 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
return EMPTY_MODEL_RUNNER_OUTPUT
|
return EMPTY_MODEL_RUNNER_OUTPUT
|
||||||
|
|
||||||
output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
|
output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT)
|
||||||
output.finished_sending = finished_sending
|
output.kv_connector_output = KVConnectorOutput(
|
||||||
output.finished_recving = finished_recving
|
finished_sending=finished_sending,
|
||||||
|
finished_recving=finished_recving)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
Reference in New Issue
Block a user