[main][prefill optimization] Optimize parallel strategies to reduce communication overhead (#2198)

### What this PR does / why we need it? 1.Shared Expert Sharding Strategy Update: Switched from TP-aligned to pure DP for shared experts, enabling more efficient execution. 2.O_Proj AllReduce → ReduceScatter: Reduced communication overhead by using ReduceScatter, made possible by pure DP sharding. 3.AllGather Postponed: Delayed to after QKV down projection to reduce synchronization impact during prefill. ### How was this patch tested? Adding ut case in `tests/ut/attention/test_mla_v1.py` #### How to run use parameter `--additional_config='{"enable_shared_expert_dp": true}'` ##### a.How to run eager mode eg: python -m vllm.entrypoints.openai.api_server --model=/model_path --trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002 --max-model-len 5120 --max-num-batched-tokens 16384 --enforce-eager --disable-log-requests --additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp": true,"chunked_prefill_for_mla":true}' ##### b.How to run graph mode eg: python -m vllm.entrypoints.openai.api_server --model=/model_path --trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002 --max-model-len 5120 --max-num-batched-tokens 16384 --disable-log-requests --additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp": true,"chunked_prefill_for_mla":true,"torchair_graph_config":{"enabled":true}}' - vLLM version: v0.10.0 - vLLM main: 9edd1db02b --------- Signed-off-by: Wang Kunpeng <1289706727@qq.com> Signed-off-by: SlightwindSec <slightwindsec@gmail.com> Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
2025-08-12 14:12:12 +08:00
parent 81817908ca
commit dc585f148a
6 changed files with 169 additions and 37 deletions
--- a/vllm_ascend/ops/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe.py
@@ -1268,6 +1268,7 @@ class AscendFusedMoE(FusedMoE):
        self.enable_multistream_moe = \
            ascend_config.torchair_graph_config.enable_multistream_moe and \
            self.torchair_graph_enabled
+        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp

        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
@@ -1408,22 +1409,24 @@ class AscendFusedMoE(FusedMoE):
            else:
                # TODO: Determine if we can remove the padding
                padding_size = tp_size
-            if num_tokens < padding_size:
+            if num_tokens < padding_size and not self.enable_shared_expert_dp:
                hidden_states = nn.functional.pad(
                    hidden_states, (0, 0, 0, padding_size - num_tokens))
                router_logits = nn.functional.pad(
                    router_logits, (0, 0, 0, padding_size - num_tokens))
            if tp_size > 1:
-                chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                         tp_size,
-                                                         dim=0)
-                chunk_router_logits = torch.tensor_split(router_logits,
-                                                         tp_size,
-                                                         dim=0)
-                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
                tp_rank = get_tensor_model_parallel_rank()
-                hidden_states = chunk_hidden_states[tp_rank]
-                router_logits = chunk_router_logits[tp_rank]
+                if not self.enable_shared_expert_dp:
+                    chunk_hidden_states = torch.tensor_split(hidden_states,
+                                                             tp_size,
+                                                             dim=0)
+                    chunk_router_logits = torch.tensor_split(router_logits,
+                                                             tp_size,
+                                                             dim=0)
+                    hidden_states = chunk_hidden_states[tp_rank]
+                    router_logits = chunk_router_logits[tp_rank]
+
+                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
                mc2_mask = chunk_mc2_mask[tp_rank]

        if self.dp_size > 1:
@@ -1490,7 +1493,7 @@ class AscendFusedMoE(FusedMoE):
        if (fused_moe_state not in [
                FusedMoEState.AllGather, FusedMoEState.AllGatherEP,
                FusedMoEState.NaiveMulticast
-        ] and not replace_allreduce):
+        ] and not replace_allreduce and not self.enable_shared_expert_dp):
            if tp_size > 1:
                dist.all_gather(list(chunk_hidden_states), e_hidden_states,
                                self.tp_group)
@@ -1500,7 +1503,7 @@ class AscendFusedMoE(FusedMoE):
                final_hidden_states = e_hidden_states
            if num_tokens < padding_size:
                final_hidden_states = final_hidden_states[:num_tokens]
-        elif self.dp_size > 1:
+        elif self.dp_size > 1 and not self.enable_shared_expert_dp:
            if fused_moe_state == FusedMoEState.NaiveMulticast:
                start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
                    self.dp_rank - 1]