perf: use multicast to avoid padding decode request to prefill size (#1555)

### What this PR does / why we need it?

perf: use multicast to avoid padding decode request to prefill size

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main: 1fd471e957

Signed-off-by: boying <897013703@qq.com>
2025-07-07 22:36:03 +08:00
parent f08c4f15a2
commit df84cceca8
3 changed files with 81 additions and 34 deletions
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -419,6 +419,7 @@ class FusedMoEState(Enum):
    All2All = 1
    MC2 = 2
    AllGatherEP = 3
+    NaiveMulticast = 4


 # TODO(zzzzwwjj): add soc_version to choose branch
@@ -430,7 +431,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
            and is_deepseek_v3_r1):
        return FusedMoEState.AllGatherEP
    elif ep_size == 1:
-        return FusedMoEState.AllGather
+        if with_prefill:
+            return FusedMoEState.NaiveMulticast
+        else:
+            return FusedMoEState.AllGather
    # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
    elif ep_size < 16 or with_prefill:
        return FusedMoEState.All2All