【main】SP For Qwen3 MoE (#2209)

### What this PR does / why we need it?
Qwen3 MoE supports SP. In scenarios like AlltoAll, AlltoAllv, and MC2,
replacing AllReduce with Reduce-Scatter and AllGather achieves
computational benefits in norm operations while saving one AllGather
communication. This feature is enabled during the P-phase and delivers
notable gains in long-sequence scenarios (e.g., 16k–25k), with
performance improvements reaching 5%–10%.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
``` 
compilation_config={
    "pass_config":{
        "enable_sequence_parallelism": True
    }
},
enable_expert_parallel=True,
```

- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: libaokui <libaokui@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
This commit is contained in:
lbk-sys
2025-08-07 09:15:49 +08:00
committed by GitHub
parent 57b9f02185
commit c611291661
11 changed files with 299 additions and 11 deletions

View File

@@ -47,6 +47,7 @@ from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig)
from vllm_ascend.ops.sequence_parallel import MetadataForPadding
from vllm_ascend.torchair.utils import npu_stream_switch, npu_wait_tensor
from vllm_ascend.utils import (AscendSocVersion, dispose_tensor,
get_all_reduce_merge_state,
@@ -1347,7 +1348,8 @@ class AscendFusedMoE(FusedMoE):
top_k: Optional[int] = None,
shared_experts: Optional[Any] = None,
gate=None,
replace_allreduce: bool = False):
replace_allreduce: bool = False,
_metadata_for_padding: Optional[MetadataForPadding] = None):
assert self.quant_method is not None
@@ -1381,7 +1383,17 @@ class AscendFusedMoE(FusedMoE):
# When all_reduce_merge is in progress, shared_experts does not do all_reduce in mlp, but waits until shared_experts+router_experts are completed before doing all_reduce
shared_hidden_states = shared_experts(hidden_states)
mc2_mask = forward_context.mc2_mask
enable_sp = _metadata_for_padding is not None and _metadata_for_padding.not_dummy_and_is_prefill
tp_size = get_tensor_model_parallel_world_size()
if enable_sp:
tp_rank = get_tensor_model_parallel_rank()
mc2_mask_sp = _metadata_for_padding.mc2_mask if _metadata_for_padding is not None else forward_context.mc2_mask
chunk_mc2_mask = torch.tensor_split(mc2_mask_sp, tp_size, dim=0)
mc2_mask = chunk_mc2_mask[tp_rank]
replace_allreduce = True
if (fused_moe_state not in [
FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
FusedMoEState.NaiveMulticast

View File

@@ -0,0 +1,120 @@
import torch
from torch.nn import functional as F
from vllm.distributed import (get_tensor_model_parallel_world_size,
get_tp_group, tensor_model_parallel_all_gather,
tensor_model_parallel_reduce_scatter)
from vllm.forward_context import get_forward_context
from vllm_ascend.platform import NPUPlatform
class MetadataForPadding:
def __init__(self,
padding_flag=False,
lengths_sum_padding=0,
lengths_sum_unpadding=0,
pad_size=0,
not_dummy_and_is_prefill=False):
self.padding_flag = padding_flag
self.not_dummy_and_is_prefill = not_dummy_and_is_prefill
self.lengths_sum_padding = lengths_sum_padding
self.lengths_sum_unpadding = lengths_sum_unpadding
self.pad_size = pad_size
self.tp_size = get_tp_group().world_size
self.tp_rank_in_group = get_tp_group().rank_in_group
assert self.lengths_sum_padding % self.tp_size == 0
self.slice_size = self.lengths_sum_padding // self.tp_size
self.mc2_mask = torch.zeros(
self.lengths_sum_padding,
dtype=torch.bool,
device=NPUPlatform.device_type,
)
self.mc2_mask[:lengths_sum_unpadding] = True
def padding_aligned_reduce_scatter(self,
data: torch.Tensor) -> torch.Tensor:
if self.padding_flag:
pad_size = self.pad_size
padded_data = F.pad(data, (0, 0, 0, pad_size))
else:
padded_data = data
padded_data_reduce_scatter = tensor_model_parallel_reduce_scatter(
padded_data, 0)
return padded_data_reduce_scatter
def allgather_unpadding_aligned(self,
padded_data: torch.Tensor) -> torch.Tensor:
padded_data_allgather = tensor_model_parallel_all_gather(
padded_data, 0)
if self.padding_flag:
lengths_sum_unpadding = self.lengths_sum_unpadding
unpadding_data = padded_data_allgather[:lengths_sum_unpadding]
else:
unpadding_data = padded_data_allgather
return unpadding_data
def padding_slice(self, data: torch.Tensor) -> torch.Tensor:
padded_data = F.pad(data, (0, 0, 0, self.pad_size))
start = self.tp_rank_in_group * self.slice_size
end = start + self.slice_size
slice_data = padded_data[start:end]
return slice_data
def padding_aligned_scatter(self, data: torch.Tensor) -> torch.Tensor:
if self.padding_flag:
pad_size = self.pad_size
padded_data = F.pad(data, (0, 0, 0, pad_size))
else:
padded_data = data
# padded_data = data
padded_data = torch.tensor_split(padded_data, self.tp_size, dim=0)
padded_data_reduce_scatter = padded_data[self.tp_rank_in_group]
return padded_data_reduce_scatter
def init_metadata_for_sp(input_ids, enable_sequence_parallelism):
if not enable_sequence_parallelism:
return MetadataForPadding(padding_flag=False,
not_dummy_and_is_prefill=False)
is_perifll = 0
attn_metadata = get_forward_context().attn_metadata
tp_size = get_tensor_model_parallel_world_size()
if attn_metadata is not None:
if hasattr(attn_metadata,
'is_only_prefill') and attn_metadata.is_only_prefill:
is_perifll = 1
if hasattr(attn_metadata,
'num_prefills') and attn_metadata.num_prefills > 0:
is_perifll = 1
if is_perifll:
lengths_sum_unpadding = input_ids.shape[0]
lengths_sum_padding = (
(lengths_sum_unpadding + tp_size - 1) // tp_size) * tp_size
if lengths_sum_unpadding == lengths_sum_padding:
padding_flag = False
else:
padding_flag = True
pad_size = lengths_sum_padding - lengths_sum_unpadding
_metadata_for_padding = MetadataForPadding(
lengths_sum_unpadding=lengths_sum_unpadding,
lengths_sum_padding=lengths_sum_padding,
padding_flag=padding_flag,
pad_size=pad_size,
not_dummy_and_is_prefill=True)
return _metadata_for_padding
return MetadataForPadding(padding_flag=False,
not_dummy_and_is_prefill=False)