[Bugfix] Disable the dispatch_ffn_combine kernel in the MTP path (#4751)

### What this PR does / why we need it?

This PR fixes a smoke test failure. It adjusts mtp_proposer and
model_runner_v1 to route MTP decoding through the non-fused MoE
implementation while keeping the overall inference flow unchanged.

- vLLM version: v0.12.0
- vLLM main: ad32e3e19c

Signed-off-by: mojave2 <chenchen145@huawei.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Commit: 848419d1ba (parent cd1c69ee0b)
Author: Chen Chen
Date: 2025-12-09 22:14:05 +08:00
Committed by: GitHub
2 changed files with 12 additions and 7 deletions

@@ -52,8 +52,8 @@ from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
 from vllm.distributed.parallel_state import (get_dcp_group, get_dp_group,
-                                             get_pcp_group, get_pp_group,
-                                             get_tp_group,
+                                             get_ep_group, get_pcp_group,
+                                             get_pp_group, get_tp_group,
                                              is_global_first_rank)
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
@@ -2332,10 +2332,11 @@ class NPUModelRunner(LoRAModelRunnerMixin, ECConnectorModelRunnerMixin):
             moe_comm_type = MoECommType.ALLGATHER
         elif soc_version in {AscendDeviceType._910_93}:
-            moe_comm_type = (MoECommType.MC2
-                             if num_tokens <= self.mc2_tokens_capacity else
-                             MoECommType.FUSED_ALLTOALL if quant_type
-                             == "w8a8_dynamic" else MoECommType.ALLTOALL)
+            # TODO: drop the EP-size guard when dispatch_ffn_combine supports larger EP sizes
+            moe_comm_type = (
+                MoECommType.MC2 if num_tokens <= self.mc2_tokens_capacity else
+                MoECommType.FUSED_ALLTOALL if quant_type == "w8a8_dynamic"
+                and get_ep_group().world_size <= 16 else MoECommType.ALLTOALL)
         else:
             raise ValueError(f"Unsupported soc_version: {soc_version}")