[Feature]EPLB:Adapt DispatchGmmCombineDecode operator to eplb tensor list and expert token numbers (#5552)
#### What this PR does / why we need it?
This PR adapt DispatchGmmCombineDecode operator to eplb tensor list and
expert token numbers.
This operator support gmm1, gmm2, gmm1Scale and gmm2Scale in format of
list.
This operator support couting how many token each local expert recieves
by expertTokensNum .
- vLLM version: v0.13.0
- vLLM main:
7157596103
More info about this operator, please refer to RFC: issue
https://github.com/vllm-project/vllm-ascend/issues/5476
This commit is contained in:
@@ -242,15 +242,14 @@ def select_moe_comm_method(num_tokens: int,
|
||||
ascend_config = get_ascend_config()
|
||||
dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
|
||||
# TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
|
||||
# TODO: drop dynamic_eplb guard when dispatch_gmm_combine_decode supports tensor list inputs
|
||||
# TODO: drop speculative method guard when dispatch_gmm_combine_decode supports w16a16
|
||||
fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic" and (
|
||||
not dynamic_eplb)
|
||||
fused_mc2_enable = envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 and quant_type == "w8a8_dynamic"
|
||||
dispatch_ffn_combine_enable = get_ep_group().world_size <= 16 and (
|
||||
not is_draft_model) and (not dynamic_eplb)
|
||||
if num_tokens <= mc2_tokens_capacity:
|
||||
fused_decode_enable = fused_mc2_enable
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
|
||||
fused_decode_enable = fused_mc2_enable and get_ep_group(
|
||||
).world_size <= 16 and (not is_draft_model)
|
||||
fused_decode_enable = fused_mc2_enable and dispatch_ffn_combine_enable
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
|
||||
fused_decode_enable = fused_mc2_enable and \
|
||||
speculative_enable_dispatch_gmm_combine_decode(vllm_config)
|
||||
@@ -258,8 +257,7 @@ def select_moe_comm_method(num_tokens: int,
|
||||
else:
|
||||
fused_prefill_enable = fused_mc2_enable
|
||||
if envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 1:
|
||||
fused_prefill_enable = fused_mc2_enable and get_ep_group(
|
||||
).world_size <= 16 and (not is_draft_model)
|
||||
fused_prefill_enable = fused_mc2_enable and dispatch_ffn_combine_enable
|
||||
elif envs_ascend.VLLM_ASCEND_ENABLE_FUSED_MC2 == 2:
|
||||
fused_prefill_enable = False
|
||||
moe_comm_type = MoECommType.FUSED_MC2 if fused_prefill_enable else MoECommType.ALLTOALL
|
||||
|
||||
Reference in New Issue
Block a user