enable ep32 for dispatch_ffn_combine (#5787)

### What this PR does / why we need it?
Raise the EP-size limit for the `dispatch_ffn_combine` operator from 16 to 32, so it can be selected on EP-32 deployments.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
Verified with single-operator tests.

---------

Signed-off-by: lhchg <lhao_cheng@163.com>
Author: lhchg
Date: 2026-01-13 14:35:52 +08:00
Committed by: GitHub
Commit: 4b679984de (parent: 84d4f474c0)
2 changed files with 2 additions and 2 deletions


```diff
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
     # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
     # TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
     fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
-    dispatch_ffn_combine_enable = get_ep_group().world_size <= 16 and (
+    dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (
        not is_draft_model) and (not dynamic_eplb)
     if num_tokens <= mc2_tokens_capacity:
        fused_decode_enable = fused_mc2_enable
```
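
For context, here is a minimal sketch of the two guards this hunk touches, with `envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2` and `get_ep_group().world_size` replaced by plain parameters; the function and constant names below are illustrative, not the repository's actual API.

```python
# Illustrative sketch only: stand-ins for envs_ascend and get_ep_group().
MAX_DISPATCH_FFN_COMBINE_EP = 32  # raised from 16 by this commit


def moe_comm_guards(fused_mc2_env: int, quant_type: str, ep_world_size: int,
                    is_draft_model: bool,
                    dynamic_eplb: bool) -> tuple[bool, bool]:
    """Return (fused_mc2_enable, dispatch_ffn_combine_enable) as in the hunk."""
    fused_mc2_enable = bool(fused_mc2_env) and quant_type == "w8a8_dynamic"
    # dispatch_ffn_combine now tolerates EP sizes up to 32, but still rejects
    # MTP draft models and dynamic EPLB (see the TODOs in the diff).
    dispatch_ffn_combine_enable = (ep_world_size <= MAX_DISPATCH_FFN_COMBINE_EP
                                   and not is_draft_model
                                   and not dynamic_eplb)
    return fused_mc2_enable, dispatch_ffn_combine_enable
```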


```diff
@@ -123,7 +123,7 @@ env_variables: Dict[str, Callable[[], Any]] = {
     # Whether to enable fused mc2(`dispatch_gmm_combine_decode`/`dispatch_ffn_combine` operator)
     # 0, or not set: default ALLTOALL and MC2 will be used.
     # 1: ALLTOALL and MC2 might be replaced by `dispatch_ffn_combine` operator.
-    # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=16, non-mtp, non-dynamic-eplb.
+    # `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=32, non-mtp, non-dynamic-eplb.
     # 2: MC2 might be replaced by `dispatch_gmm_combine_decode` operator.
     # `dispatch_gmm_combine_decode` can be used only for **decode node** moe layer
     # with W8A8. And MTP layer must be W8A8.
```
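
As a hedged usage sketch: the variable name `VLLM_ASCEND_ENABLE_FUSED_MC2` comes from the diff above, but everything else here (setting it from Python, the timing constraint) is illustrative and assumes the variable is read when vllm_ascend initializes its environment table.

```python
# Illustrative only: opt in to dispatch_ffn_combine on an EP-32 deployment.
# Must be set before vllm_ascend reads its environment variables.
import os

os.environ["VLLM_ASCEND_ENABLE_FUSED_MC2"] = "1"  # 1 -> dispatch_ffn_combine
# Per the docstring in the hunk above, this takes effect only for W8A8 MoE
# layers with EP <= 32 (this commit), no MTP, and dynamic EPLB disabled.
```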