From 0918de58d5634d468549bd700a5693e1eea4f2ad Mon Sep 17 00:00:00 2001 From: LI SHENGYONG <49200266+shenchuxiaofugui@users.noreply.github.com> Date: Tue, 16 Dec 2025 10:59:30 +0800 Subject: [PATCH] [Bugfix] dynamic eplb doesn't use fused_alltoall (#4919) ### What this PR does / why we need it? The fused alltoall operator itself was not designed or implemented to handle the scenario where tensors are lists, but the weights for dynamic load balancing are in list form. Therefore, we have disabled this operator when using dynamic load balancing. - vLLM version: v0.12.0 - vLLM main: https://github.com/vllm-project/vllm/commit/ad32e3e19ccf0526cb6744a5fed09a138a5fb2f9 Signed-off-by: shenchuxiaofugui <1311027364@qq.com> --- vllm_ascend/worker/model_runner_v1.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 5af8af4e..77c53cf0 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1434,10 +1434,13 @@ class NPUModelRunner(GPUModelRunner): moe_comm_type = MoECommType.ALLGATHER elif soc_version in {AscendDeviceType._910_93}: - moe_comm_type = ( - MoECommType.MC2 if num_tokens <= mc2_tokens_capacity else - MoECommType.FUSED_ALLTOALL if quant_type == "w8a8_dynamic" - and get_ep_group().world_size <= 16 else MoECommType.ALLTOALL) + # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes + fused_all2all_enable = quant_type == "w8a8_dynamic" and get_ep_group( + ).world_size <= 16 and (not self.dynamic_eplb) + moe_comm_type = (MoECommType.MC2 + if num_tokens <= mc2_tokens_capacity else + MoECommType.FUSED_ALLTOALL + if fused_all2all_enable else MoECommType.ALLTOALL) else: raise ValueError(f"Unsupported soc_version: {soc_version}")