【main】SP For Qwen3 MoE (#2209)

### What this PR does / why we need it? Qwen3 MoE supports SP. In scenarios like AlltoAll, AlltoAllv, and MC2, replacing AllReduce with Reduce-Scatter and AllGather achieves computational benefits in norm operations while saving one AllGather communication. This feature is enabled during the P-phase and delivers notable gains in long-sequence scenarios (e.g., 16k–25k), with performance improvements reaching 5%–10%. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? ``` compilation_config={ "pass_config":{ "enable_sequence_parallelism": True } }, enable_expert_parallel=True, ``` - vLLM version: v0.10.0 - vLLM main: 9edd1db02b --------- Signed-off-by: libaokui <libaokui@huawei.com> Co-authored-by: libaokui <libaokui@huawei.com>
2025-08-07 09:15:49 +08:00
parent 57b9f02185
commit c611291661
11 changed files with 299 additions and 11 deletions
--- a/vllm_ascend/models/qwen3_moe.py
+++ b/vllm_ascend/models/qwen3_moe.py
@@ -16,14 +16,15 @@
 # limitations under the License.
 # Adapted from vllm/model_executor/models/qwen3_moe.py
 # This file is a part of the vllm-ascend project.
-from typing import Optional
+
+from typing import Optional, Union

 import torch
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, CompilationLevel, VllmConfig
-from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.distributed.parallel_state import (get_dp_group, get_ep_group,
                                             get_tp_group)
 from vllm.forward_context import get_forward_context
@@ -44,8 +45,11 @@ from vllm.model_executor.models.qwen3_moe import (Qwen3MoeAttention,
 from vllm.model_executor.models.utils import (
    PPMissingLayer, extract_layer_index,
    make_empty_intermediate_tensors_factory, make_layers, maybe_prefix)
+from vllm.sequence import IntermediateTensors

 from vllm_ascend.ops.fused_moe import AscendFusedMoE
+from vllm_ascend.ops.sequence_parallel import (MetadataForPadding,
+                                               init_metadata_for_sp)


 class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
@@ -96,6 +100,7 @@ class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
        self,
        hidden_states,
        attn_metadata=None,
+        _metadata_for_padding: Optional[MetadataForPadding] = None,
    ):
        if attn_metadata is None:
            attn_metadata = get_forward_context().attn_metadata
@@ -114,6 +119,7 @@ class CustomSparseMoeBlock(Qwen3MoeSparseMoeBlock):
            top_k=self.top_k,
            enable_force_load_balance=enable_force_load_balance,
            shared_experts=None,
+            _metadata_for_padding=_metadata_for_padding,
        )

        return hidden_states
@@ -155,14 +161,14 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
        layer_idx = extract_layer_index(prefix)
        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
                           config.mlp_only_layers)
-        use_aclgraph = (vllm_config is not None
-                        and vllm_config.compilation_config.level
-                        == CompilationLevel.PIECEWISE
-                        and not vllm_config.model_config.enforce_eager)
+        self.use_aclgraph = (vllm_config is not None
+                             and vllm_config.compilation_config.level
+                             == CompilationLevel.PIECEWISE
+                             and not vllm_config.model_config.enforce_eager)
        if (layer_idx not in mlp_only_layers) and (
                config.num_experts > 0 and
            (layer_idx + 1) % config.decoder_sparse_step == 0):
-            if not use_aclgraph:
+            if not self.use_aclgraph:
                # FIXME: custom sparse moe block doesn't work with aclgraph.
                self.mlp = CustomSparseMoeBlock(config=config,
                                                quant_config=quant_config,
@@ -182,6 +188,60 @@ class CustomQwen3MoeDecoderLayer(Qwen3MoeDecoderLayer):
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

+        self.enable_sequence_parallelism = (
+            vllm_config.compilation_config.pass_config.
+            enable_sequence_parallelism if vllm_config is not None else False)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        _metadata_for_padding: Optional[MetadataForPadding] = None,
+    ) -> torch.Tensor:
+
+        # To prevent precision issues during the decoder phase when only prefilling enables SP
+        if not self.enable_sequence_parallelism:
+            self.self_attn.o_proj.reduce_results = True
+        else:
+            self.self_attn.o_proj.reduce_results = not _metadata_for_padding.not_dummy_and_is_prefill if _metadata_for_padding is not None else True
+
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
+                residual = _metadata_for_padding.padding_slice(residual)
+
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+
+            if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
+                hidden_states = _metadata_for_padding.allgather_unpadding_aligned(
+                    hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+
+        if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
+            hidden_states = _metadata_for_padding.padding_aligned_reduce_scatter(
+                hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+
+        if not self.use_aclgraph:
+            hidden_states = self.mlp(
+                hidden_states, _metadata_for_padding=_metadata_for_padding)
+        else:
+            hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+

@support_torch_compile
 class CustomQwen3MoeModel(Qwen3MoeModel):
@@ -216,6 +276,45 @@ class CustomQwen3MoeModel(Qwen3MoeModel):
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        _metadata_for_padding: Optional[MetadataForPadding] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+            residual = intermediate_tensors["residual"]
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+                _metadata_for_padding=_metadata_for_padding)
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+                "residual": residual
+            })
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        if _metadata_for_padding and _metadata_for_padding.not_dummy_and_is_prefill:
+            hidden_states = _metadata_for_padding.allgather_unpadding_aligned(
+                hidden_states)
+
+        return hidden_states
+

 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
    packed_modules_mapping = {
@@ -253,6 +352,7 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

+        self.enable_sequence_parallelism = vllm_config.compilation_config.pass_config.enable_sequence_parallelism
        # Set MoE hyperparameters
        self.expert_weights: list[torch.Tensor] = []

@@ -273,3 +373,16 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
        self.num_moe_layers = len(self.moe_layers)
        self.num_expert_groups = 1
        self.num_shared_experts = 0
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        _metadata_for_padding = init_metadata_for_sp(
+            input_ids, self.enable_sequence_parallelism)
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
+                                   inputs_embeds, _metadata_for_padding)
+        return hidden_states