[main] SP for Qwen3 MoE (#2209)
### What this PR does / why we need it?
This PR adds sequence parallelism (SP) support for Qwen3 MoE. In scenarios
that use AlltoAll, AlltoAllv, or MC2, replacing AllReduce with
Reduce-Scatter plus AllGather yields computational savings in the norm
operations while also saving one AllGather communication. The feature is
enabled during the prefill phase (P-phase) and delivers notable gains in
long-sequence scenarios (e.g., 16k–25k), with performance improvements of
5%–10%.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
```
compilation_config={
"pass_config":{
"enable_sequence_parallelism": True
}
},
enable_expert_parallel=True,
```
- vLLM version: v0.10.0
- vLLM main:
9edd1db02b
---------
Signed-off-by: libaokui <libaokui@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
This commit is contained in:
@@ -151,6 +151,7 @@ class AscendMetadata:
|
||||
slot_mapping: torch.Tensor = None
|
||||
|
||||
enable_dbo_across_dp: bool = False
|
||||
is_only_prefill: bool = False
|
||||
|
||||
|
||||
class AscendAttentionMetadataBuilder:
|
||||
@@ -166,7 +167,8 @@ class AscendAttentionMetadataBuilder:
|
||||
num_reqs,
|
||||
num_actual_tokens,
|
||||
max_query_len,
|
||||
enable_dbo_across_dp: bool = False):
|
||||
enable_dbo_across_dp: bool = False,
|
||||
is_only_prefill: bool = False):
|
||||
|
||||
block_table = self.runner.input_batch.block_table[0].get_device_tensor(
|
||||
)
|
||||
@@ -203,7 +205,8 @@ class AscendAttentionMetadataBuilder:
|
||||
slot_mapping=slot_mapping,
|
||||
attn_mask=attn_mask,
|
||||
attn_state=attn_state,
|
||||
enable_dbo_across_dp=enable_dbo_across_dp)
|
||||
enable_dbo_across_dp=enable_dbo_across_dp,
|
||||
is_only_prefill=is_only_prefill)
|
||||
return attn_metadata
|
||||
|
||||
|
||||
|
||||
@@ -223,7 +223,9 @@ class AscendAttentionTorchairMetadataBuilder:
|
||||
num_actual_tokens,
|
||||
max_query_len,
|
||||
graph_pad_size: int = -1,
|
||||
enable_dbo_across_dp: bool = False):
|
||||
enable_dbo_across_dp: bool = False,
|
||||
*args,
|
||||
**kwargs):
|
||||
|
||||
device = self.runner.device
|
||||
|
||||
|
||||
@@ -384,6 +384,8 @@ class AscendMLAMetadataBuilder:
|
||||
graph_pad_size: int = -1,
|
||||
query_start_loc: torch.Tensor = None,
|
||||
enable_dbo_across_dp: bool = False,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> AscendMLAMetadata:
|
||||
assert self._num_decodes + self._num_prefills == num_reqs
|
||||
|
||||
|
||||
Reference in New Issue
Block a user