### What this PR does / why we need it?
Cherry-pick #1291 from v0.9.1-dev. This PR implements the synchronization
of whether `dbo` is enabled across all DP ranks. Specifically, it
performs an allreduce op across the DP ranks; `dbo` is enabled only when
every DP rank has `enable_dbo` set.
Co-authored-by: shikang-hangzhou <459956190@qq.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
- vLLM version: v0.10.0
- vLLM main:
2836dd73f1
---------
Signed-off-by: wangli <wangli858794774@gmail.com>
This commit is contained in:
@@ -75,7 +75,6 @@ from vllm_ascend.multistream.layers import (MultiStreamPostTransformerLayer,
|
||||
from vllm_ascend.multistream.metadata import (MultiStreamConfig,
|
||||
MultiStreamStepMetadata,
|
||||
make_multistream_metadata_ds)
|
||||
from vllm_ascend.multistream.ms_split import compute_split_seq_index
|
||||
from vllm_ascend.ops.fused_moe import AscendFusedMoE
|
||||
from vllm_ascend.utils import dispose_tensor
|
||||
|
||||
@@ -872,24 +871,9 @@ class CustomDeepseekDBOModel(nn.Module):
|
||||
|
||||
def can_run_ms(self):
|
||||
attn_metadata = get_forward_context().attn_metadata
|
||||
# support mla attention and V1 engine at present
|
||||
if not self.use_mla:
|
||||
return False
|
||||
# enable prefill overlap
|
||||
if attn_metadata is None or attn_metadata.num_prefills == 0:
|
||||
return False
|
||||
else:
|
||||
[token_index, seq_index
|
||||
] = compute_split_seq_index(attn_metadata.query_lens,
|
||||
attn_metadata.attn_state,
|
||||
attn_metadata.num_decode_tokens)
|
||||
if token_index == 0 or seq_index == 0 or seq_index == len(
|
||||
attn_metadata.query_lens):
|
||||
return False
|
||||
# check whether the total tokens exceed the threshold
|
||||
if self.multistream_config is None or attn_metadata.num_actual_tokens < self.multistream_config.min_total_tokens_to_split:
|
||||
return False
|
||||
return True
|
||||
return not (attn_metadata is None or attn_metadata.num_prefills == 0
|
||||
or not attn_metadata.enable_dbo_across_dp)
|
||||
|
||||
def _forward_ms_layers(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user