[v0.11.0] [P/D] force with_prefill true after allreduce in kv producer (#3835)

### What this PR does / why we need it?
Force `with_prefill` to true after the allreduce when this node is a kv producer. This is a backport of #3768 and #3849.

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
liziyu
2025-10-29 23:14:00 +08:00
committed by GitHub
parent b323be9fe4
commit e5b938c5fe
2 changed files with 6 additions and 5 deletions

View File

@@ -1145,7 +1145,8 @@ class MooncakeLayerwiseConnectorWorker:
connector_metadata: MooncakeLayerwiseConnectorMetadata,
**kwargs) -> None:
"""MooncakeLayerwiseConnector does not save explicitly."""
if self.kv_role == 'kv_producer':
if self.kv_role == 'kv_producer' and connector_metadata.requests.keys(
):
if self.pd_head_ratio != 1:
if self.current_layer != 0:
self.completion_event.wait()

View File

@@ -2364,6 +2364,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
num_tokens = math.ceil(num_tokens / tp_size) * tp_size
# Force dummy run on prefill stage when this node is deemed as kv producer.
if self.is_kv_producer and not self.is_kv_consumer:
with_prefill = True
# Padding for DP
(num_tokens, num_tokens_across_dp, with_prefill,
_) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)
@@ -2411,10 +2415,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
dtype=np.int32)
# Force dummy run on prefill stage when this node is deemed as kv producer.
if self.is_kv_producer and not self.is_kv_consumer:
with_prefill = True
if not self.in_profile_run and self.dynamic_eplb:
self.eplb_updator.forward_before()