support fused_moe_allgather_ep (#1335)

### What this PR does / why we need it?
Add support for the fused MoE all-gather expert-parallel path (`AllGatherEP`), enabled via the `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` environment variable.

### How was this patch tested?
Tested with unit tests.

Signed-off-by: lyj-jjj <liuyingjun5@huawei.com>
lyj-jjj
2025-06-23 22:03:38 +08:00
committed by GitHub
parent 917c6b71af
commit 5177bef87a
5 changed files with 218 additions and 14 deletions


```diff
@@ -394,11 +394,18 @@ class FusedMoEState(Enum):
     AllGather = 0
     All2All = 1
     MC2 = 2
+    AllGatherEP = 3


 # TODO(zzzzwwjj): add soc_version to choose branch
-def get_fused_moe_state(ep_size: int, with_prefill: bool):
-    if ep_size == 1:
+def get_fused_moe_state(ep_size: int, with_prefill: bool,
+                        is_deepseek_v3_r1: bool):
+    # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
+    # only supports deepseek v3/r1
+    if (envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
+            and is_deepseek_v3_r1):
+        return FusedMoEState.AllGatherEP
+    elif ep_size == 1:
         return FusedMoEState.AllGather
     # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
     elif ep_size < 16 or with_prefill:
```
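
For reference, here is a minimal, self-contained sketch of the selection logic above. The `allgather_ep_enabled` parameter stands in for `envs.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP`, and the final `All2All`/`MC2` branches are reconstructed from the NOTE comment (MC2 needs `ep_size >= 16`; All2All covers smaller EP sizes and prefill), so those two branches are assumptions rather than a verbatim copy of the file's tail:

```python
from enum import Enum


class FusedMoEState(Enum):
    AllGather = 0
    All2All = 1
    MC2 = 2
    AllGatherEP = 3


def get_fused_moe_state(ep_size: int, with_prefill: bool,
                        is_deepseek_v3_r1: bool,
                        allgather_ep_enabled: bool) -> FusedMoEState:
    # The fused kernel behind AllGatherEP
    # (torch_npu.npu_grouped_matmul_finalize_routing) only supports
    # DeepSeek V3/R1, so the new branch is gated on the model check too.
    if allgather_ep_enabled and ep_size > 1 and is_deepseek_v3_r1:
        return FusedMoEState.AllGatherEP
    elif ep_size == 1:
        return FusedMoEState.AllGather
    # Assumed tail, per the NOTE: MC2 needs ep_size >= 16, and All2All
    # handles smaller EP sizes as well as prefill.
    elif ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
    else:
        return FusedMoEState.MC2


if __name__ == "__main__":
    # With the flag on and a DeepSeek V3/R1 model, EP > 1 takes the new path.
    assert get_fused_moe_state(8, False, True, True) is FusedMoEState.AllGatherEP
    # With the flag off, the pre-existing selection is unchanged.
    assert get_fused_moe_state(8, False, True, False) is FusedMoEState.All2All
    assert get_fused_moe_state(16, False, False, False) is FusedMoEState.MC2
```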