perf: use multicast to avoid padding decode request to prefill size (#1555)
### What this PR does / why we need it?
perf: use multicast to avoid padding decode request to prefill size
### How was this patch tested?
- vLLM version: v0.9.1
- vLLM main:
1fd471e957
Signed-off-by: boying <897013703@qq.com>
This commit is contained in:
@@ -419,6 +419,7 @@ class FusedMoEState(Enum):
|
||||
All2All = 1
|
||||
MC2 = 2
|
||||
AllGatherEP = 3
|
||||
NaiveMulticast = 4
|
||||
|
||||
|
||||
# TODO(zzzzwwjj): add soc_version to choose branch
|
||||
@@ -430,7 +431,10 @@ def get_fused_moe_state(ep_size: int, with_prefill: bool,
|
||||
and is_deepseek_v3_r1):
|
||||
return FusedMoEState.AllGatherEP
|
||||
elif ep_size == 1:
|
||||
return FusedMoEState.AllGather
|
||||
if with_prefill:
|
||||
return FusedMoEState.NaiveMulticast
|
||||
else:
|
||||
return FusedMoEState.AllGather
|
||||
# NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
|
||||
elif ep_size < 16 or with_prefill:
|
||||
return FusedMoEState.All2All
|
||||
|
||||
Reference in New Issue
Block a user