perf: use multicast to avoid padding decode request to prefill size (#1555)

### What this PR does / why we need it?
perf: use multicast to avoid padding decode requests to the prefill size

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main: 1fd471e957

Signed-off-by: boying <897013703@qq.com>
This commit is contained in:
NeverRaR
2025-07-07 22:36:03 +08:00
committed by GitHub
parent f08c4f15a2
commit df84cceca8
3 changed files with 81 additions and 34 deletions

View File

@@ -419,6 +419,7 @@ class FusedMoEState(Enum):
All2All = 1
MC2 = 2
AllGatherEP = 3
NaiveMulticast = 4
# TODO(zzzzwwjj): add soc_version to choose branch
@@ -430,7 +431,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
and is_deepseek_v3_r1):
return FusedMoEState.AllGatherEP
elif ep_size == 1:
return FusedMoEState.AllGather
if with_prefill:
return FusedMoEState.NaiveMulticast
else:
return FusedMoEState.AllGather
# NOTE: mc2 needs ep_size >= 16, and all2all can't be used in torchair graph.
elif ep_size < 16 or with_prefill:
return FusedMoEState.All2All