[v0.11.0] [P/D] force with_prefill true after allreduce in kv producer (#3835)
### What this PR does / why we need it?

Force `with_prefill` to true after allreduce in the KV producer. This is a backport of #3768 and #3849.

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -1145,7 +1145,8 @@ class MooncakeLayerwiseConnectorWorker:
|
|||||||
connector_metadata: MooncakeLayerwiseConnectorMetadata,
|
connector_metadata: MooncakeLayerwiseConnectorMetadata,
|
||||||
**kwargs) -> None:
|
**kwargs) -> None:
|
||||||
"""MooncakeLayerwiseConnector does not save explicitly."""
|
"""MooncakeLayerwiseConnector does not save explicitly."""
|
||||||
if self.kv_role == 'kv_producer':
|
if self.kv_role == 'kv_producer' and connector_metadata.requests.keys(
|
||||||
|
):
|
||||||
if self.pd_head_ratio != 1:
|
if self.pd_head_ratio != 1:
|
||||||
if self.current_layer != 0:
|
if self.current_layer != 0:
|
||||||
self.completion_event.wait()
|
self.completion_event.wait()
|
||||||
|
|||||||
@@ -2364,6 +2364,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
|
tp_size = self.vllm_config.parallel_config.tensor_parallel_size
|
||||||
num_tokens = math.ceil(num_tokens / tp_size) * tp_size
|
num_tokens = math.ceil(num_tokens / tp_size) * tp_size
|
||||||
|
|
||||||
|
# Force dummy run on prefill stage when this node is deemed as kv producer.
|
||||||
|
if self.is_kv_producer and not self.is_kv_consumer:
|
||||||
|
with_prefill = True
|
||||||
|
|
||||||
# Padding for DP
|
# Padding for DP
|
||||||
(num_tokens, num_tokens_across_dp, with_prefill,
|
(num_tokens, num_tokens_across_dp, with_prefill,
|
||||||
_) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)
|
_) = self._sync_metadata_across_dp(num_tokens, with_prefill, False)
|
||||||
@@ -2411,10 +2415,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
|
num_scheduled_tokens = np.array(num_scheduled_tokens_list,
|
||||||
dtype=np.int32)
|
dtype=np.int32)
|
||||||
|
|
||||||
# Force dummy run on prefill stage when this node is deemed as kv producer.
|
|
||||||
if self.is_kv_producer and not self.is_kv_consumer:
|
|
||||||
with_prefill = True
|
|
||||||
|
|
||||||
if not self.in_profile_run and self.dynamic_eplb:
|
if not self.in_profile_run and self.dynamic_eplb:
|
||||||
self.eplb_updator.forward_before()
|
self.eplb_updator.forward_before()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user