[bugfix] Use FUSED_MC2 MoE comm path for the op dispatch_ffn_combine (#5156)
### What this PR does / why we need it?
- Renames the MoE comm enum value `MoECommType.FUSED_ALLTOALL` to
`MoECommType.FUSED_MC2` and updates all call sites.
- Updates `select_moe_comm_method` to optionally select `FUSED_MC2` on
Ascend A3 when:
- `enable_expert_parallel=True`
- quantization is `w8a8_dynamic`
- `EP <= 16`
- `dynamic_eplb` is disabled
- `is_mtp_model=False`
- Replaces the old “fused all-to-all” comm implementation with
`FusedMC2CommImpl`, using `TokenDispatcherWithMC2` /
`PrepareAndFinalizeWithMC2` and `dispatch_ffn_combine`.
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
---------
Signed-off-by: Chen Chen <0109chenchen@gmail.com>
This commit is contained in:
@@ -430,7 +430,8 @@ class NPUModelRunner(GPUModelRunner):
         # moe_comm_method of each rank is MC2 and recomputation would never happen in D
         # nodes. So here we check whether recompute_scheduler_enable is True.
-        return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
-            potential_max_num_tokens, self.vllm_config) == MoECommType.MC2
+        return self.is_kv_consumer and self.ascend_config.recompute_scheduler_enable and select_moe_comm_method(
+            potential_max_num_tokens,
+            self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}

     def _sync_metadata_across_dp(
         self, num_tokens: int,
@@ -1058,7 +1059,7 @@ class NPUModelRunner(GPUModelRunner):
         # (num_reqs_d + num_reqs_p, max_num_blocks),
         # flattened block_table: [d0, d0, d1, d1, p0, p1, p2]
         # (num_reqs_d * decode_threshold + num_reqs_p, max_num_blocks),
-        ori_query_lens = self.query_start_loc_pcp_full.cpu[1:num_reqs+1] - \
+        ori_query_lens = self.query_start_loc_pcp_full.cpu[1:num_reqs + 1] - \
             self.query_start_loc_pcp_full.cpu[:num_reqs]
         num_prefill_reqs = (ori_query_lens
                             > self.decode_threshold).sum().item()
@@ -2203,7 +2204,7 @@ class NPUModelRunner(GPUModelRunner):
     def profile_run(self) -> None:
         mc2_tokens_capacity = get_mc2_tokens_capacity()
         if self.max_num_tokens > mc2_tokens_capacity and \
-            select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) == MoECommType.MC2:
+            select_moe_comm_method(mc2_tokens_capacity, self.vllm_config) in {MoECommType.MC2, MoECommType.FUSED_MC2}:
             self._dummy_run(mc2_tokens_capacity,
                             with_prefill=True,
                             is_profile=True)
Reference in New Issue
Block a user