enable ep32 for dispatch_ffn_combine (#5787)
### What this PR does / why we need it? Enables EP size 32 support for the dispatch_ffn_combine operator. ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Tested with a single-operator test. --------- Signed-off-by: lhchg <lhao_cheng@163.com>
This commit is contained in:
@@ -244,7 +244,7 @@ def select_moe_comm_method(num_tokens: int,
|
||||
# TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
|
||||
# TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
|
||||
fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
|
||||
dispatch_ffn_combine_enable = get_ep_group().world_size <= 16 and (
|
||||
dispatch_ffn_combine_enable = get_ep_group().world_size <= 32 and (
|
||||
not is_draft_model) and (not dynamic_eplb)
|
||||
if num_tokens <= mc2_tokens_capacity:
|
||||
fused_decode_enable = fused_mc2_enable
|
||||
|
||||
@@ -123,7 +123,7 @@ env_variables: Dict[str, Callable[[], Any]] = {
|
||||
# Whether to enable fused mc2(`dispatch_gmm_combine_decode`/`dispatch_ffn_combine` operator)
|
||||
# 0, or not set: default ALLTOALL and MC2 will be used.
|
||||
# 1: ALLTOALL and MC2 might be replaced by `dispatch_ffn_combine` operator.
|
||||
# `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=16, non-mtp, non-dynamic-eplb.
|
||||
# `dispatch_ffn_combine` can be used only for moe layer with W8A8, EP<=32, non-mtp, non-dynamic-eplb.
|
||||
# 2: MC2 might be replaced by `dispatch_gmm_combine_decode` operator.
|
||||
# `dispatch_gmm_combine_decode` can be used only for **decode node** moe layer
|
||||
# with W8A8. And MTP layer must be W8A8.
|
||||
|
||||
Reference in New Issue
Block a user