[Refactor] remove moe type of multicast. (#4224)

The main purposes of this PR are as follows: 
1. Remove the multicast-related code; 

Reason:
1. In scenarios like A2 dual-system back-to-back networking, the
performance is worse than all_gather. Before the modification, the e2e
test ran at 3 tps; after the modification, it runs at 10 tps.
2. At the same time, we usually enable the SP feature, which is consistent
with the current logic.
3. The advantage of broadcast communication is that it does not suffer
from uneven DP load and does not require the prefill ACL graph to be
enabled. However, we have recently added support for the prefill ACL graph.

So we think there is no need to keep maintaining multicast as an option
for MoE communication.

Performance benefits are as follows:
When enable_flashcomm1 is disabled, TTFT remains relatively stable at
around 43000 ms, which is approximately 15000 ms faster than before the
modification.

When enable_flashcomm1 is enabled, there is no difference: TTFT remains
relatively stable at around 29000 ms.


- vLLM version: v0.11.0
- vLLM main:
2918c1b49c

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Signed-off-by: weijinqian0 <1184188277@qq.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
This commit is contained in:
weijinqian0
2025-11-24 17:32:37 +08:00
committed by GitHub
parent 5508a602ed
commit ae068a3342
10 changed files with 30 additions and 249 deletions

View File

@@ -6,7 +6,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoEConfig
from vllm_ascend.ops.fused_moe.prepare_finalize import ( from vllm_ascend.ops.fused_moe.prepare_finalize import (
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather,
PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast) PrepareAndFinalizeWithMC2)
class TestPrepareAndFinalize(unittest.TestCase): class TestPrepareAndFinalize(unittest.TestCase):
@@ -222,59 +222,3 @@ class TestPrepareAndFinalize(unittest.TestCase):
mock_tp_all_reduce.return_value = result mock_tp_all_reduce.return_value = result
result_with_tp = layer.finalize(h_out, reduce_results=True) result_with_tp = layer.finalize(h_out, reduce_results=True)
self.assertEqual(result_with_tp.shape[0], 3) self.assertEqual(result_with_tp.shape[0], 3)
@patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_dp_group")
@patch(
"vllm_ascend.ops.fused_moe.prepare_finalize.tensor_model_parallel_all_reduce"
)
@patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context")
def test_naive_multicast_prepare_finalize(self, mock_get_forward_context,
mock_tp_all_reduce,
mock_get_dp_group):
# Mock forward context with DP metadata
mock_context = MagicMock()
mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor(
[2, 5, 7])
mock_get_forward_context.return_value = mock_context
# Setup DP group mock
mock_dp_group = MagicMock()
mock_dp_group.broadcast = MagicMock()
mock_dp_group.all_reduce = MagicMock()
mock_get_dp_group.return_value = mock_dp_group
# Mock all_reduce to just return input (simulate sum)
def mock_all_reduce(tensor):
return tensor * 2
mock_dp_group.all_reduce.side_effect = mock_all_reduce
# Setup config
self.moe_config.dp_size = 3
self.moe_config.dp_rank = 1
self.moe_config.tp_size = 1
self.moe_config.ep_size = 1
layer = PrepareAndFinalizeWithNaiveMulticast(self.moe_config)
# Local inputs
hidden_states = torch.randn(3, 8)
router_logits = torch.randn(3, 2)
# Run prepare
h_out, r_out, _, _ = layer.prepare(hidden_states, router_logits)
# Should be global tensor: [7, 8] and [7, 2]
self.assertEqual(h_out.shape, (7, 8))
self.assertEqual(r_out.shape, (7, 2))
# Run finalize
result = layer.finalize(h_out, reduce_results=False)
# Should slice back to local: [3, 8]
self.assertEqual(result.shape, (3, 8))
# Test with reduce_results=True and TP/EP > 1
mock_tp_all_reduce.return_value = result
result_with_tp = layer.finalize(h_out, reduce_results=True)
self.assertEqual(result_with_tp.shape, (3, 8))

View File

@@ -23,7 +23,7 @@ from pytest_mock import MockerFixture
from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase
import vllm_ascend import vllm_ascend
from vllm_ascend.ascend_forward_context import _get_fused_moe_state from vllm_ascend.ascend_forward_context import get_fused_moe_state
from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod from vllm_ascend.quantization.quant_config import AscendFusedMoEMethod
from vllm_ascend.torchair.ops.torchair_fused_moe import ( from vllm_ascend.torchair.ops.torchair_fused_moe import (
TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod)
@@ -360,7 +360,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config( global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config(
).init_redundancy_expert ).init_redundancy_expert
is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256 is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256
forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
ep_size, is_prefill, is_deepseek_v3_r1)) ep_size, is_prefill, is_deepseek_v3_r1))
with patch( with patch(
"vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context",
@@ -396,7 +396,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
ep_size = others_param ep_size = others_param
is_prefill = False is_prefill = False
forward_context = MagicMock( forward_context = MagicMock(
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True))
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])

View File

@@ -74,7 +74,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel,
# Bind the real method to the mock object # Bind the real method to the mock object
method = NPUModelRunner._select_moe_comm_method( method = NPUModelRunner._select_moe_comm_method(
mock_runner, num_tokens, False) mock_runner, num_tokens)
# Assert the result # Assert the result
assert method == expected_method assert method == expected_method
@@ -108,4 +108,4 @@ def test_select_moe_comm_method_unsupported_soc():
return_value=True), \ return_value=True), \
pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"): pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"):
NPUModelRunner._select_moe_comm_method(mock_runner, 100, False) NPUModelRunner._select_moe_comm_method(mock_runner, 100)

View File

@@ -29,16 +29,8 @@ class FusedMoEState(Enum):
All2AllSeq = 5 All2AllSeq = 5
class MoECommType(Enum): def get_fused_moe_state(ep_size: int, with_prefill: bool,
ALLGATHER = 0 is_deepseek_v3_r1: bool):
MC2 = 1
ALLTOALL = 2
NAIVE_MULTICAST = 3
# TODO(zzzzwwjj): add soc_version to choose branch
def _get_fused_moe_state(ep_size: int, with_prefill: bool,
is_deepseek_v3_r1: bool):
# the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep
# only supports deepseek v3/r1 # only supports deepseek v3/r1
if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1 if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1
@@ -56,6 +48,12 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool,
return FusedMoEState.MC2 return FusedMoEState.MC2
class MoECommType(Enum):
ALLGATHER = 0
MC2 = 1
ALLTOALL = 2
@contextmanager @contextmanager
def set_ascend_forward_context( def set_ascend_forward_context(
attn_metadata: Any, attn_metadata: Any,
@@ -103,8 +101,8 @@ def set_ascend_forward_context(
is_deepseek_v3_r1 = hasattr( is_deepseek_v3_r1 = hasattr(
vllm_config.model_config.hf_config, 'n_routed_experts' vllm_config.model_config.hf_config, 'n_routed_experts'
) and vllm_config.model_config.hf_config.n_routed_experts == 256 ) and vllm_config.model_config.hf_config.n_routed_experts == 256
fused_moe_state = _get_fused_moe_state(ep_size, with_prefill, fused_moe_state = get_fused_moe_state(ep_size, with_prefill,
is_deepseek_v3_r1) is_deepseek_v3_r1)
forward_context.fused_moe_state = fused_moe_state forward_context.fused_moe_state = fused_moe_state
forward_context.in_profile_run = in_profile_run forward_context.in_profile_run = in_profile_run

View File

@@ -27,7 +27,7 @@ from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.fused_moe.prepare_finalize import ( from vllm_ascend.ops.fused_moe.prepare_finalize import (
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather,
PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast, QuantType) PrepareAndFinalizeWithMC2, QuantType)
from vllm_ascend.ops.fused_moe.token_dispatcher import ( from vllm_ascend.ops.fused_moe.token_dispatcher import (
TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather, TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
TokenDispatcherWithMC2, TokenDispatcherWithMoge) TokenDispatcherWithMC2, TokenDispatcherWithMoge)
@@ -44,8 +44,6 @@ def setup_moe_comm_method(moe_config):
_MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config) _MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config)
_MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config) _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config)
_MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config) _MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config)
_MoECommMethods[MoECommType.NAIVE_MULTICAST] = NaiveMulticastCommImpl(
moe_config)
class MoECommMethod(ABC): class MoECommMethod(ABC):
@@ -245,32 +243,3 @@ class AlltoAllCommImpl(MoECommMethod):
def _get_prepare_finalize(self): def _get_prepare_finalize(self):
return PrepareAndFinalizeWithAll2All(self.moe_config) return PrepareAndFinalizeWithAll2All(self.moe_config)
class NaiveMulticastCommImpl(MoECommMethod):
"""This implementation is the same as NativeAllGatherCommImpl,
but uses NPU-specific ops for better performance.
This implementation should be compatible with all scenarios, and
thus it is the default implementation for MoE communication methods.
It uses `torch_npu.npu_moe_init_routing_v2` for pre-processing
and `torch_npu.npu_moe_token_unpermute` for post-processing
to handle the token-to-expert mapping and communication efficiently.
NOTE(Yizhou): TBH, it is really weird that we were supposed to use
`torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing`
or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute`
for pre-processing and post-processing, respectively.
But `npu_moe_finalize_routing` will lead to accuracy issues so we have to
use `torch_npu.npu_moe_token_unpermute` instead.
This is a workaround and should be removed after the issue is fixed.
"""
def _get_token_dispatcher(self):
return TokenDispatcherWithAllGather(
top_k=self.moe_config.experts_per_token,
num_experts=self.moe_config.num_experts,
num_local_experts=self.moe_config.num_local_experts)
def _get_prepare_finalize(self):
return PrepareAndFinalizeWithNaiveMulticast(self.moe_config)

View File

@@ -45,7 +45,7 @@ class PrepareAndFinalize(ABC):
""" """
Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization
in distributed environments. Subclasses implement specific communication strategies in distributed environments. Subclasses implement specific communication strategies
(e.g., AllGather, All2All, MC2, Naive Multicast) to handle tensor padding, slicing, (e.g., AllGather, All2All, MC2) to handle tensor padding, slicing,
broadcasting, and reduction across TP/DP/EP groups. broadcasting, and reduction across TP/DP/EP groups.
Attributes: Attributes:
@@ -454,115 +454,3 @@ class PrepareAndFinalizeWithAllGather(PrepareAndFinalize):
hidden_states = tensor_model_parallel_all_reduce(hidden_states) hidden_states = tensor_model_parallel_all_reduce(hidden_states)
return hidden_states return hidden_states
class PrepareAndFinalizeWithNaiveMulticast(PrepareAndFinalize):
"""
MoE communication strategy using Naive Multicast (point-to-point broadcast).
Will be used in prefill when using allgather in decode. Each DP rank broadcasts its slice to all others.
Uses `cu_tokens_across_dp_cpu` (cumulative tokens) to locate slice boundaries.
"""
def _naive_multicast(self, x: torch.Tensor,
cu_tokens_across_dp_cpu: torch.Tensor):
"""
Naive multicast implementation:
1. Create global buffer sized by total tokens across DP.
2. Current rank copies its slice into its designated buffer region.
3. Each rank broadcasts its slice to all others via P2P.
Args:
x (torch.Tensor): Local tensor [local_tokens, hidden_size]
cu_tokens_across_dp_cpu (torch.Tensor): Cumulative token counts per DP rank
Returns:
torch.Tensor: Global tensor [total_tokens, hidden_size]
"""
assert len(x.shape) == 2, "Input must be 2D [tokens, features]"
buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
device=x.device,
dtype=x.dtype)
# Copy local slice into buffer
start = 0 if self.moe_config.dp_rank == 0 else cu_tokens_across_dp_cpu[
self.moe_config.dp_rank - 1]
end = cu_tokens_across_dp_cpu[self.moe_config.dp_rank]
buffer[start:end, :].copy_(x)
# Broadcast each slice to all ranks
for idx in range(self.moe_config.dp_size):
start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
end = cu_tokens_across_dp_cpu[idx]
get_dp_group().broadcast(buffer[start:end, :], idx)
return buffer
def prepare(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
enable_shared_expert_dp: bool = False,
replace_allreduce: bool = False,
quant_type=QuantType.NONE
) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor],
Optional[torch.Tensor]]:
"""
Preparation steps:
1. Fetch cumulative token boundaries from forward context.
2. Multicast hidden_states and router_logits to form global tensors.
Returns:
Tuple of (global_hidden_states, global_router_logits, None, None)
"""
self.enable_shared_expert_dp = enable_shared_expert_dp
if self.moe_config.dp_size > 1:
self.cu_tokens_across_dp_cpu = get_forward_context(
).dp_metadata.cu_tokens_across_sp(1)
hidden_states = self._naive_multicast(hidden_states,
self.cu_tokens_across_dp_cpu)
router_logits = self._naive_multicast(router_logits,
self.cu_tokens_across_dp_cpu)
if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().all_gather(
hidden_states,
dim=0,
)
router_logits = get_pcp_group().all_gather(
router_logits,
dim=0,
)
return hidden_states, router_logits, None, None
def finalize(self,
hidden_states: torch.Tensor,
reduce_results: bool,
context_metadata: Optional[dict] = None) -> torch.Tensor:
"""
Finalization steps:
1. If DP > 1 and not shared expert:
- All-reduce across DP
- Slice to current rank's token range using cu_tokens_across_dp_cpu
2. If `reduce_results=True` and TP/EP > 1, apply tensor_model_parallel_all_reduce.
Returns:
Tensor with shape [local_num_tokens, hidden_size]
"""
if self.moe_config.dp_size > 1 and not self.enable_shared_expert_dp:
start = 0 if self.moe_config.dp_rank == 0 else self.cu_tokens_across_dp_cpu[
self.moe_config.dp_rank - 1]
end = self.cu_tokens_across_dp_cpu[self.moe_config.dp_rank]
hidden_states = get_dp_group().all_reduce(
hidden_states) # Sum across DP
hidden_states = hidden_states[start:end, :]
if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1:
hidden_states = get_pcp_group().reduce_scatter(hidden_states,
dim=0)
if reduce_results and (self.moe_config.tp_size > 1
or self.moe_config.ep_size > 1):
hidden_states = tensor_model_parallel_all_reduce(hidden_states)
return hidden_states

View File

@@ -124,8 +124,7 @@ class EagleProposer(Proposer):
num_tokens_across_dp: Optional[torch.Tensor] = None, num_tokens_across_dp: Optional[torch.Tensor] = None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None): batch_descriptor=None):
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
with set_ascend_forward_context(None, with set_ascend_forward_context(None,
self.vllm_config, self.vllm_config,
moe_comm_type=moe_comm_type, moe_comm_type=moe_comm_type,
@@ -460,11 +459,7 @@ class EagleProposer(Proposer):
else: else:
num_input_tokens = num_tokens num_input_tokens = num_tokens
with_prefill = attn_metadata.attn_state not in [ moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
]
moe_comm_type = self.runner._select_moe_comm_method(
num_input_tokens, with_prefill)
# copy inputs to buffer for cudagraph # copy inputs to buffer for cudagraph
self.positions[:num_tokens] = target_positions.to(device) self.positions[:num_tokens] = target_positions.to(device)
@@ -504,8 +499,7 @@ class EagleProposer(Proposer):
else: else:
input_batch_size = batch_size input_batch_size = batch_size
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(input_batch_size)
input_batch_size, False)
attn_metadata.num_actual_tokens = batch_size attn_metadata.num_actual_tokens = batch_size
attn_metadata.max_query_len = 1 attn_metadata.max_query_len = 1

View File

@@ -223,8 +223,7 @@ class MtpProposer(Proposer):
with_prefill, with_prefill,
) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill) ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill)
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
if skip_attn: if skip_attn:
attn_metadata = None attn_metadata = None
@@ -672,8 +671,7 @@ class MtpProposer(Proposer):
with_prefill) = self.runner._sync_metadata_across_dp( with_prefill) = self.runner._sync_metadata_across_dp(
num_input_tokens, self.runner.with_prefill) num_input_tokens, self.runner.with_prefill)
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
num_input_tokens, with_prefill)
if scheduler_output: if scheduler_output:
max_query_len = common_attn_metadata.max_query_len max_query_len = common_attn_metadata.max_query_len

View File

@@ -81,8 +81,7 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp=None, num_tokens_across_dp=None,
aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
batch_descriptor=None) -> None: batch_descriptor=None) -> None:
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
num_tokens, with_prefill)
if not with_prefill: if not with_prefill:
skip_attn = False skip_attn = False
@@ -342,8 +341,7 @@ class TorchairMtpProposer(MtpProposer):
num_tokens_across_dp = self.runner.num_tokens_across_dp num_tokens_across_dp = self.runner.num_tokens_across_dp
with_prefill = self.runner.with_prefill with_prefill = self.runner.with_prefill
moe_comm_type = self.runner._select_moe_comm_method( moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens)
num_input_tokens, with_prefill)
batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens,
uniform_decode=False) uniform_decode=False)
aclgraph_runtime_mode, batch_descriptor = \ aclgraph_runtime_mode, batch_descriptor = \

View File

@@ -2192,8 +2192,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
kv_connector_output=kv_connector_output, kv_connector_output=kv_connector_output,
) )
def _select_moe_comm_method(self, num_tokens: int, def _select_moe_comm_method(self,
with_prefill: bool) -> Optional[MoECommType]: num_tokens: int) -> Optional[MoECommType]:
"""1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all
are designed for expert parallelism. are designed for expert parallelism.
2. If expert parallel is enabled, we need to consider the soc version and the 2. If expert parallel is enabled, we need to consider the soc version and the
@@ -2244,12 +2244,6 @@ class NPUModelRunner(LoRAModelRunnerMixin):
else: else:
raise ValueError(f"Unsupported soc_version: {soc_version}") raise ValueError(f"Unsupported soc_version: {soc_version}")
if moe_comm_type == MoECommType.ALLGATHER and with_prefill:
if enable_sp():
moe_comm_type = MoECommType.ALLGATHER
else:
moe_comm_type = MoECommType.NAIVE_MULTICAST
# PanguProMoE only supports allgather # PanguProMoE only supports allgather
if model_type == "PanguProMoE": if model_type == "PanguProMoE":
moe_comm_type = MoECommType.ALLGATHER moe_comm_type = MoECommType.ALLGATHER
@@ -2289,8 +2283,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
if self.dynamic_eplb: if self.dynamic_eplb:
self.eplb_updator.take_update_info_from_eplb_process() self.eplb_updator.take_update_info_from_eplb_process()
moe_comm_type = self._select_moe_comm_method(num_input_tokens, moe_comm_type = self._select_moe_comm_method(num_input_tokens)
self.with_prefill)
uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
scheduler_output.total_num_scheduled_tokens scheduler_output.total_num_scheduled_tokens
@@ -2823,7 +2816,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
with_prefill) = self._sync_metadata_across_dp(num_tokens, with_prefill) = self._sync_metadata_across_dp(num_tokens,
with_prefill) with_prefill)
moe_comm_type = self._select_moe_comm_method(num_tokens, with_prefill) moe_comm_type = self._select_moe_comm_method(num_tokens)
# If cudagraph_mode.decode_mode() == FULL and # If cudagraph_mode.decode_mode() == FULL and
# cudagraph_mode.seperate_routine(). This means that we are using # cudagraph_mode.seperate_routine(). This means that we are using
@@ -2999,8 +2992,7 @@ class NPUModelRunner(LoRAModelRunnerMixin):
# allowing vLLM to correctly estimate the maximum memory required. # allowing vLLM to correctly estimate the maximum memory required.
if self.max_num_tokens > self.mc2_tokens_capacity and \ if self.max_num_tokens > self.mc2_tokens_capacity and \
self._select_moe_comm_method( self._select_moe_comm_method(
self.mc2_tokens_capacity, self.mc2_tokens_capacity) == MoECommType.MC2:
with_prefill=True) == MoECommType.MC2:
self._dummy_run(self.mc2_tokens_capacity, with_prefill=True) self._dummy_run(self.mc2_tokens_capacity, with_prefill=True)
output = None output = None