From c76db627ab94bdd37c56c1e859844312c2b43d0e Mon Sep 17 00:00:00 2001 From: liziyu <56102866+liziyu179@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:15:38 +0800 Subject: [PATCH] [P/D] force with_prefill true after allreduce in kv producer (#3768) ### What this PR does / why we need it? Force `with_prefill` to `True` on a node that is a KV producer and not a KV consumer before the DP padding/metadata sync (`_sync_metadata_across_dp`) instead of after it. Also enter the `kv_producer` branch of the Mooncake layerwise connector only when `connector_metadata.request` is non-empty. - vLLM version: v0.11.0rc3 - vLLM main: https://github.com/vllm-project/vllm/commit/c9461e05a4ed3557cfbf4b15ded1e26761cc39ca --------- Signed-off-by: liziyu --- vllm_ascend/distributed/mooncake_layerwise_connector.py | 2 +- vllm_ascend/worker/model_runner_v1.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_ascend/distributed/mooncake_layerwise_connector.py b/vllm_ascend/distributed/mooncake_layerwise_connector.py index a45aaa17..f379e426 100644 --- a/vllm_ascend/distributed/mooncake_layerwise_connector.py +++ b/vllm_ascend/distributed/mooncake_layerwise_connector.py @@ -1151,7 +1151,7 @@ class MooncakeLayerwiseConnectorWorker: connector_metadata: MooncakeLayerwiseConnectorMetadata, **kwargs) -> None: """MooncakeLayerwiseConnector does not save explicitly.""" - if self.kv_role == 'kv_producer': + if self.kv_role == 'kv_producer' and connector_metadata.request.keys(): if self.pd_head_ratio != 1: if self.current_layer != 0: self.completion_event.wait() diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index fbf1d3e2..522c3efc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2645,6 +2645,10 @@ class NPUModelRunner(LoRAModelRunnerMixin): tp_size = self.vllm_config.parallel_config.tensor_parallel_size num_tokens = math.ceil(num_tokens / tp_size) * tp_size + # Force dummy run on prefill stage when this node is deemed as kv producer. 
+ if self.is_kv_producer and not self.is_kv_consumer: + with_prefill = True + # Padding for DP (num_tokens, num_tokens_across_dp, with_prefill) = self._sync_metadata_across_dp(num_tokens, @@ -2693,10 +2697,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - # Force dummy run on prefill stage when this node is deemed as kv producer. - if self.is_kv_producer and not self.is_kv_consumer: - with_prefill = True - if not self.in_profile_run and self.dynamic_eplb: self.eplb_updator.forward_before()