[P/D] force with_prefill true after allreduce in kv producer (#3768)
### What this PR does / why we need it?
Force `with_prefill` to `True` after the all-reduce metadata sync when this node acts as a KV producer (and not a KV consumer), by moving the override before the DP padding/sync step so the forced value is propagated consistently.
- vLLM version: v0.11.0rc3
- vLLM main:
c9461e05a4
---------
Signed-off-by: liziyu <liziyu16@huawei.com>
This commit is contained in:
@@ -2645,6 +2645,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         tp_size = self.vllm_config.parallel_config.tensor_parallel_size
         num_tokens = math.ceil(num_tokens / tp_size) * tp_size
 
+        # Force dummy run on prefill stage when this node is deemed as kv producer.
+        if self.is_kv_producer and not self.is_kv_consumer:
+            with_prefill = True
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp,
          with_prefill) = self._sync_metadata_across_dp(num_tokens,
@@ -2693,10 +2697,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
         num_scheduled_tokens = np.array(num_scheduled_tokens_list,
                                         dtype=np.int32)
 
-        # Force dummy run on prefill stage when this node is deemed as kv producer.
-        if self.is_kv_producer and not self.is_kv_consumer:
-            with_prefill = True
-
         if not self.in_profile_run and self.dynamic_eplb:
             self.eplb_updator.forward_before()
Reference in New Issue
Block a user