【main】SP For Qwen3 MoE (#2209)

### What this PR does / why we need it?

Adds sequence parallelism (SP) support for Qwen3 MoE. In scenarios such as AlltoAll, AlltoAllv, and MC2, replacing AllReduce with Reduce-Scatter and AllGather achieves computational benefits in the norm operations while saving one AllGather communication. This feature is enabled during the prefill (P) phase and delivers notable gains in long-sequence scenarios (e.g., 16k–25k), with performance improvements reaching 5%–10%.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

```
compilation_config={
    "pass_config": {
        "enable_sequence_parallelism": True
    }
},
enable_expert_parallel=True,
```

- vLLM version: v0.10.0
- vLLM main: 9edd1db02b

---------

Signed-off-by: libaokui <libaokui@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
2025-08-07 09:15:49 +08:00
parent 57b9f02185
commit c611291661
11 changed files with 299 additions and 11 deletions
--- a/vllm_ascend/attention/attention_v1.py
+++ b/vllm_ascend/attention/attention_v1.py
@@ -151,6 +151,7 @@ class AscendMetadata:
    slot_mapping: torch.Tensor = None

    enable_dbo_across_dp: bool = False
+    is_only_prefill: bool = False


 class AscendAttentionMetadataBuilder:
@@ -166,7 +167,8 @@ class AscendAttentionMetadataBuilder:
              num_reqs,
              num_actual_tokens,
              max_query_len,
-              enable_dbo_across_dp: bool = False):
+              enable_dbo_across_dp: bool = False,
+              is_only_prefill: bool = False):

        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
        )
@@ -203,7 +205,8 @@ class AscendAttentionMetadataBuilder:
            slot_mapping=slot_mapping,
            attn_mask=attn_mask,
            attn_state=attn_state,
-            enable_dbo_across_dp=enable_dbo_across_dp)
+            enable_dbo_across_dp=enable_dbo_across_dp,
+            is_only_prefill=is_only_prefill)
        return attn_metadata


--- a/vllm_ascend/attention/attention_v1_torchair.py
+++ b/vllm_ascend/attention/attention_v1_torchair.py
@@ -223,7 +223,9 @@ class AscendAttentionTorchairMetadataBuilder:
              num_actual_tokens,
              max_query_len,
              graph_pad_size: int = -1,
-              enable_dbo_across_dp: bool = False):
+              enable_dbo_across_dp: bool = False,
+              *args,
+              **kwargs):

        device = self.runner.device

--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -384,6 +384,8 @@ class AscendMLAMetadataBuilder:
        graph_pad_size: int = -1,
        query_start_loc: torch.Tensor = None,
        enable_dbo_across_dp: bool = False,
+        *args,
+        **kwargs,
    ) -> AscendMLAMetadata:
        assert self._num_decodes + self._num_prefills == num_reqs