From ae068a334297efcb7cb54e51bfa2d7c3a00ddbd6 Mon Sep 17 00:00:00 2001 From: weijinqian0 <1184188277@qq.com> Date: Mon, 24 Nov 2025 17:32:37 +0800 Subject: [PATCH] [Refactor] remove moe type of multicast. (#4224) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main purposes of this PR are as follows: 1. Remove the multicast-related code; Reason: 1. In the scenario like a2 Dual-System Back-to-Back Networking, the performance is worse than all_gather. Before the modification, in e2e test, it was 3 tps; after the modification, it is 10 tps. 2. At the same time, we usually enable the SP feature, which is consistent with the current logic. 3. The advantage of broadcast communication lies in the fact that it does not suffer from uneven DP load and does not require the prefill ACL graph to be enabled. But we have supported the prefill ACL graph recently. So we think there is no need to maintain the multicast as one choice in moe communication. Performance benefits are as follows: When not enable_flashcomm1, TTFT remains relatively stable at around 43000ms, which is approximately 15000ms faster than before the modification. When enable_flashcomm1, there is no difference, TTFT remains relatively stable at around 29000ms. 
- vLLM version: v0.11.0 - vLLM main: https://github.com/vllm-project/vllm/commit/2918c1b49c88c29783c86f78d2c4221cb9622379 --------- Signed-off-by: weijinqian_v1 Signed-off-by: weijinqian0 <1184188277@qq.com> Co-authored-by: weijinqian_v1 --- tests/ut/ops/test_prepare_finalize.py | 58 +-------- .../torchair/ops/test_torchair_fused_moe.py | 6 +- tests/ut/worker/test_model_runner_v1.py | 4 +- vllm_ascend/ascend_forward_context.py | 22 ++-- vllm_ascend/ops/fused_moe/moe_comm_method.py | 33 +---- vllm_ascend/ops/fused_moe/prepare_finalize.py | 114 +----------------- vllm_ascend/spec_decode/eagle_proposer.py | 12 +- vllm_ascend/spec_decode/mtp_proposer.py | 6 +- vllm_ascend/torchair/torchair_mtp_proposer.py | 6 +- vllm_ascend/worker/model_runner_v1.py | 18 +-- 10 files changed, 30 insertions(+), 249 deletions(-) diff --git a/tests/ut/ops/test_prepare_finalize.py b/tests/ut/ops/test_prepare_finalize.py index f0480e1c..35cb01a7 100644 --- a/tests/ut/ops/test_prepare_finalize.py +++ b/tests/ut/ops/test_prepare_finalize.py @@ -6,7 +6,7 @@ from vllm.model_executor.layers.fused_moe import FusedMoEConfig from vllm_ascend.ops.fused_moe.prepare_finalize import ( PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, - PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast) + PrepareAndFinalizeWithMC2) class TestPrepareAndFinalize(unittest.TestCase): @@ -222,59 +222,3 @@ class TestPrepareAndFinalize(unittest.TestCase): mock_tp_all_reduce.return_value = result result_with_tp = layer.finalize(h_out, reduce_results=True) self.assertEqual(result_with_tp.shape[0], 3) - - @patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_dp_group") - @patch( - "vllm_ascend.ops.fused_moe.prepare_finalize.tensor_model_parallel_all_reduce" - ) - @patch("vllm_ascend.ops.fused_moe.prepare_finalize.get_forward_context") - def test_naive_multicast_prepare_finalize(self, mock_get_forward_context, - mock_tp_all_reduce, - mock_get_dp_group): - # Mock forward context with DP metadata - 
mock_context = MagicMock() - mock_context.dp_metadata.cu_tokens_across_sp.return_value = torch.tensor( - [2, 5, 7]) - mock_get_forward_context.return_value = mock_context - - # Setup DP group mock - mock_dp_group = MagicMock() - mock_dp_group.broadcast = MagicMock() - mock_dp_group.all_reduce = MagicMock() - mock_get_dp_group.return_value = mock_dp_group - - # Mock all_reduce to just return input (simulate sum) - def mock_all_reduce(tensor): - return tensor * 2 - - mock_dp_group.all_reduce.side_effect = mock_all_reduce - - # Setup config - self.moe_config.dp_size = 3 - self.moe_config.dp_rank = 1 - self.moe_config.tp_size = 1 - self.moe_config.ep_size = 1 - - layer = PrepareAndFinalizeWithNaiveMulticast(self.moe_config) - - # Local inputs - hidden_states = torch.randn(3, 8) - router_logits = torch.randn(3, 2) - - # Run prepare - h_out, r_out, _, _ = layer.prepare(hidden_states, router_logits) - - # Should be global tensor: [7, 8] and [7, 2] - self.assertEqual(h_out.shape, (7, 8)) - self.assertEqual(r_out.shape, (7, 2)) - - # Run finalize - result = layer.finalize(h_out, reduce_results=False) - - # Should slice back to local: [3, 8] - self.assertEqual(result.shape, (3, 8)) - - # Test with reduce_results=True and TP/EP > 1 - mock_tp_all_reduce.return_value = result - result_with_tp = layer.finalize(h_out, reduce_results=True) - self.assertEqual(result_with_tp.shape, (3, 8)) diff --git a/tests/ut/torchair/ops/test_torchair_fused_moe.py b/tests/ut/torchair/ops/test_torchair_fused_moe.py index 57569a28..cf306d2a 100644 --- a/tests/ut/torchair/ops/test_torchair_fused_moe.py +++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py @@ -23,7 +23,7 @@ from pytest_mock import MockerFixture from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase import vllm_ascend -from vllm_ascend.ascend_forward_context import _get_fused_moe_state +from vllm_ascend.ascend_forward_context import get_fused_moe_state from vllm_ascend.quantization.quant_config import 
AscendFusedMoEMethod from vllm_ascend.torchair.ops.torchair_fused_moe import ( TorchairAscendFusedMoE, TorchairAscendUnquantizedFusedMoEMethod) @@ -360,7 +360,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod: global_redundant_expert_num = vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_config( ).init_redundancy_expert is_deepseek_v3_r1 = global_num_experts - global_redundant_expert_num == 256 - forward_context = MagicMock(fused_moe_state=_get_fused_moe_state( + forward_context = MagicMock(fused_moe_state=get_fused_moe_state( ep_size, is_prefill, is_deepseek_v3_r1)) with patch( "vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", @@ -396,7 +396,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod: ep_size = others_param is_prefill = False forward_context = MagicMock( - fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True)) + fused_moe_state=get_fused_moe_state(ep_size, is_prefill, True)) with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \ patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3): expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]) diff --git a/tests/ut/worker/test_model_runner_v1.py b/tests/ut/worker/test_model_runner_v1.py index b4eec741..1d781490 100644 --- a/tests/ut/worker/test_model_runner_v1.py +++ b/tests/ut/worker/test_model_runner_v1.py @@ -74,7 +74,7 @@ def test_select_moe_comm_method(soc_version, enable_expert_parallel, # Bind the real method to the mock object method = NPUModelRunner._select_moe_comm_method( - mock_runner, num_tokens, False) + mock_runner, num_tokens) # Assert the result assert method == expected_method @@ -108,4 +108,4 @@ def test_select_moe_comm_method_unsupported_soc(): return_value=True), \ pytest.raises(ValueError, match=f"Unsupported soc_version: {unsupported_soc}"): - NPUModelRunner._select_moe_comm_method(mock_runner, 100, False) + 
NPUModelRunner._select_moe_comm_method(mock_runner, 100) diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 8c477dac..11c1d3a0 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -29,16 +29,8 @@ class FusedMoEState(Enum): All2AllSeq = 5 -class MoECommType(Enum): - ALLGATHER = 0 - MC2 = 1 - ALLTOALL = 2 - NAIVE_MULTICAST = 3 - - -# TODO(zzzzwwjj): add soc_version to choose branch -def _get_fused_moe_state(ep_size: int, with_prefill: bool, - is_deepseek_v3_r1: bool): +def get_fused_moe_state(ep_size: int, with_prefill: bool, + is_deepseek_v3_r1: bool): # the fusion operator torch_npu.npu_grouped_matmul_finalize_routing called by allgather ep # only supports deepseek v3/r1 if (envs_ascend.VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP and ep_size > 1 @@ -56,6 +48,12 @@ def _get_fused_moe_state(ep_size: int, with_prefill: bool, return FusedMoEState.MC2 +class MoECommType(Enum): + ALLGATHER = 0 + MC2 = 1 + ALLTOALL = 2 + + @contextmanager def set_ascend_forward_context( attn_metadata: Any, @@ -103,8 +101,8 @@ def set_ascend_forward_context( is_deepseek_v3_r1 = hasattr( vllm_config.model_config.hf_config, 'n_routed_experts' ) and vllm_config.model_config.hf_config.n_routed_experts == 256 - fused_moe_state = _get_fused_moe_state(ep_size, with_prefill, - is_deepseek_v3_r1) + fused_moe_state = get_fused_moe_state(ep_size, with_prefill, + is_deepseek_v3_r1) forward_context.fused_moe_state = fused_moe_state forward_context.in_profile_run = in_profile_run diff --git a/vllm_ascend/ops/fused_moe/moe_comm_method.py b/vllm_ascend/ops/fused_moe/moe_comm_method.py index c89eb1df..c48ce1a4 100644 --- a/vllm_ascend/ops/fused_moe/moe_comm_method.py +++ b/vllm_ascend/ops/fused_moe/moe_comm_method.py @@ -27,7 +27,7 @@ from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.ops.fused_moe.moe_mlp import unified_apply_mlp from vllm_ascend.ops.fused_moe.prepare_finalize import ( 
PrepareAndFinalizeWithAll2All, PrepareAndFinalizeWithAllGather, - PrepareAndFinalizeWithMC2, PrepareAndFinalizeWithNaiveMulticast, QuantType) + PrepareAndFinalizeWithMC2, QuantType) from vllm_ascend.ops.fused_moe.token_dispatcher import ( TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather, TokenDispatcherWithMC2, TokenDispatcherWithMoge) @@ -44,8 +44,6 @@ def setup_moe_comm_method(moe_config): _MoECommMethods[MoECommType.ALLTOALL] = AlltoAllCommImpl(moe_config) _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl(moe_config) _MoECommMethods[MoECommType.MC2] = MC2CommImpl(moe_config) - _MoECommMethods[MoECommType.NAIVE_MULTICAST] = NaiveMulticastCommImpl( - moe_config) class MoECommMethod(ABC): @@ -245,32 +243,3 @@ class AlltoAllCommImpl(MoECommMethod): def _get_prepare_finalize(self): return PrepareAndFinalizeWithAll2All(self.moe_config) - - -class NaiveMulticastCommImpl(MoECommMethod): - """This implementation is the same as NativeAllGatherCommImpl, - but uses NPU-specific ops for better performance. - - This implementation should be compatible with all scenarios, and - thus it is the default implementation for MoE communication methods. - It uses `torch_npu.npu_moe_init_routing_v2` for pre-processing - and `torch_npu.npu_moe_token_unpermute` for post-processing - to handle the token-to-expert mapping and communication efficiently. - - NOTE(Yizhou): TBH, it is really weird that we were supposed to use - `torch_npu.npu_moe_init_routing_v2` and `torch_npu.npu_moe_finalize_routing` - or `torch_npu.npu_moe_token_permute` and `torch_npu.npu_moe_token_unpermute` - for pre-processing and post-processing, respectively. - But `npu_moe_finalize_routing` will lead to accuracy issues so we have to - use `torch_npu.npu_moe_token_unpermute` instead. - This is a workaround and should be removed after the issue is fixed. 
- """ - - def _get_token_dispatcher(self): - return TokenDispatcherWithAllGather( - top_k=self.moe_config.experts_per_token, - num_experts=self.moe_config.num_experts, - num_local_experts=self.moe_config.num_local_experts) - - def _get_prepare_finalize(self): - return PrepareAndFinalizeWithNaiveMulticast(self.moe_config) diff --git a/vllm_ascend/ops/fused_moe/prepare_finalize.py b/vllm_ascend/ops/fused_moe/prepare_finalize.py index 46640006..48350ea8 100644 --- a/vllm_ascend/ops/fused_moe/prepare_finalize.py +++ b/vllm_ascend/ops/fused_moe/prepare_finalize.py @@ -45,7 +45,7 @@ class PrepareAndFinalize(ABC): """ Abstract base class for MoE (Mixture-of-Experts) tensor preparation and finalization in distributed environments. Subclasses implement specific communication strategies - (e.g., AllGather, All2All, MC2, Naive Multicast) to handle tensor padding, slicing, + (e.g., AllGather, All2All, MC2) to handle tensor padding, slicing, broadcasting, and reduction across TP/DP/EP groups. Attributes: @@ -454,115 +454,3 @@ class PrepareAndFinalizeWithAllGather(PrepareAndFinalize): hidden_states = tensor_model_parallel_all_reduce(hidden_states) return hidden_states - - -class PrepareAndFinalizeWithNaiveMulticast(PrepareAndFinalize): - """ - MoE communication strategy using Naive Multicast (point-to-point broadcast). - Will be used in prefill when using allgather in decode. Each DP rank broadcasts its slice to all others. - Uses `cu_tokens_across_dp_cpu` (cumulative tokens) to locate slice boundaries. - """ - - def _naive_multicast(self, x: torch.Tensor, - cu_tokens_across_dp_cpu: torch.Tensor): - """ - Naive multicast implementation: - 1. Create global buffer sized by total tokens across DP. - 2. Current rank copies its slice into its designated buffer region. - 3. Each rank broadcasts its slice to all others via P2P. 
- - Args: - x (torch.Tensor): Local tensor [local_tokens, hidden_size] - cu_tokens_across_dp_cpu (torch.Tensor): Cumulative token counts per DP rank - - Returns: - torch.Tensor: Global tensor [total_tokens, hidden_size] - """ - assert len(x.shape) == 2, "Input must be 2D [tokens, features]" - buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)), - device=x.device, - dtype=x.dtype) - - # Copy local slice into buffer - start = 0 if self.moe_config.dp_rank == 0 else cu_tokens_across_dp_cpu[ - self.moe_config.dp_rank - 1] - end = cu_tokens_across_dp_cpu[self.moe_config.dp_rank] - buffer[start:end, :].copy_(x) - - # Broadcast each slice to all ranks - for idx in range(self.moe_config.dp_size): - start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1] - end = cu_tokens_across_dp_cpu[idx] - get_dp_group().broadcast(buffer[start:end, :], idx) - return buffer - - def prepare( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - enable_shared_expert_dp: bool = False, - replace_allreduce: bool = False, - quant_type=QuantType.NONE - ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], - Optional[torch.Tensor]]: - """ - Preparation steps: - 1. Fetch cumulative token boundaries from forward context. - 2. Multicast hidden_states and router_logits to form global tensors. 
- - Returns: - Tuple of (global_hidden_states, global_router_logits, None, None) - """ - self.enable_shared_expert_dp = enable_shared_expert_dp - - if self.moe_config.dp_size > 1: - self.cu_tokens_across_dp_cpu = get_forward_context( - ).dp_metadata.cu_tokens_across_sp(1) - hidden_states = self._naive_multicast(hidden_states, - self.cu_tokens_across_dp_cpu) - router_logits = self._naive_multicast(router_logits, - self.cu_tokens_across_dp_cpu) - - if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().all_gather( - hidden_states, - dim=0, - ) - router_logits = get_pcp_group().all_gather( - router_logits, - dim=0, - ) - - return hidden_states, router_logits, None, None - - def finalize(self, - hidden_states: torch.Tensor, - reduce_results: bool, - context_metadata: Optional[dict] = None) -> torch.Tensor: - """ - Finalization steps: - 1. If DP > 1 and not shared expert: - - All-reduce across DP - - Slice to current rank's token range using cu_tokens_across_dp_cpu - 2. If `reduce_results=True` and TP/EP > 1, apply tensor_model_parallel_all_reduce. 
- - Returns: - Tensor with shape [local_num_tokens, hidden_size] - """ - if self.moe_config.dp_size > 1 and not self.enable_shared_expert_dp: - start = 0 if self.moe_config.dp_rank == 0 else self.cu_tokens_across_dp_cpu[ - self.moe_config.dp_rank - 1] - end = self.cu_tokens_across_dp_cpu[self.moe_config.dp_rank] - hidden_states = get_dp_group().all_reduce( - hidden_states) # Sum across DP - hidden_states = hidden_states[start:end, :] - - if prefill_context_parallel_enable() and self.moe_config.pcp_size > 1: - hidden_states = get_pcp_group().reduce_scatter(hidden_states, - dim=0) - - if reduce_results and (self.moe_config.tp_size > 1 - or self.moe_config.ep_size > 1): - hidden_states = tensor_model_parallel_all_reduce(hidden_states) - - return hidden_states diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py index d3be2ea9..4d076ac1 100644 --- a/vllm_ascend/spec_decode/eagle_proposer.py +++ b/vllm_ascend/spec_decode/eagle_proposer.py @@ -124,8 +124,7 @@ class EagleProposer(Proposer): num_tokens_across_dp: Optional[torch.Tensor] = None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None): - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) with set_ascend_forward_context(None, self.vllm_config, moe_comm_type=moe_comm_type, @@ -460,11 +459,7 @@ class EagleProposer(Proposer): else: num_input_tokens = num_tokens - with_prefill = attn_metadata.attn_state not in [ - AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding - ] - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens) # copy inputs to buffer for cudagraph self.positions[:num_tokens] = target_positions.to(device) @@ -504,8 +499,7 @@ class EagleProposer(Proposer): else: input_batch_size = batch_size - moe_comm_type = 
self.runner._select_moe_comm_method( - input_batch_size, False) + moe_comm_type = self.runner._select_moe_comm_method(input_batch_size) attn_metadata.num_actual_tokens = batch_size attn_metadata.max_query_len = 1 diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py index 7aa9b729..556a917f 100644 --- a/vllm_ascend/spec_decode/mtp_proposer.py +++ b/vllm_ascend/spec_decode/mtp_proposer.py @@ -223,8 +223,7 @@ class MtpProposer(Proposer): with_prefill, ) = self.runner._sync_metadata_across_dp(num_tokens, with_prefill) - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) if skip_attn: attn_metadata = None @@ -672,8 +671,7 @@ class MtpProposer(Proposer): with_prefill) = self.runner._sync_metadata_across_dp( num_input_tokens, self.runner.with_prefill) - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_input_tokens) if scheduler_output: max_query_len = common_attn_metadata.max_query_len diff --git a/vllm_ascend/torchair/torchair_mtp_proposer.py b/vllm_ascend/torchair/torchair_mtp_proposer.py index 183e0da2..b816b8d8 100644 --- a/vllm_ascend/torchair/torchair_mtp_proposer.py +++ b/vllm_ascend/torchair/torchair_mtp_proposer.py @@ -81,8 +81,7 @@ class TorchairMtpProposer(MtpProposer): num_tokens_across_dp=None, aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor=None) -> None: - moe_comm_type = self.runner._select_moe_comm_method( - num_tokens, with_prefill) + moe_comm_type = self.runner._select_moe_comm_method(num_tokens) if not with_prefill: skip_attn = False @@ -342,8 +341,7 @@ class TorchairMtpProposer(MtpProposer): num_tokens_across_dp = self.runner.num_tokens_across_dp with_prefill = self.runner.with_prefill - moe_comm_type = self.runner._select_moe_comm_method( - num_input_tokens, with_prefill) + moe_comm_type = 
self.runner._select_moe_comm_method(num_input_tokens) batch_descriptor = BatchDescriptor(num_tokens=num_input_tokens, uniform_decode=False) aclgraph_runtime_mode, batch_descriptor = \ diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 124102f5..8f103ddc 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -2192,8 +2192,8 @@ class NPUModelRunner(LoRAModelRunnerMixin): kv_connector_output=kv_connector_output, ) - def _select_moe_comm_method(self, num_tokens: int, - with_prefill: bool) -> Optional[MoECommType]: + def _select_moe_comm_method(self, + num_tokens: int) -> Optional[MoECommType]: """1. If expert parallel is not enabled, we use all-gather since MC2 and all-to-all are designed for expert parallelism. 2. If expert parallel is enabled, we need to consider the soc version and the @@ -2244,12 +2244,6 @@ class NPUModelRunner(LoRAModelRunnerMixin): else: raise ValueError(f"Unsupported soc_version: {soc_version}") - if moe_comm_type == MoECommType.ALLGATHER and with_prefill: - if enable_sp(): - moe_comm_type = MoECommType.ALLGATHER - else: - moe_comm_type = MoECommType.NAIVE_MULTICAST - # PanguProMoE only supports allgather if model_type == "PanguProMoE": moe_comm_type = MoECommType.ALLGATHER @@ -2289,8 +2283,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): if self.dynamic_eplb: self.eplb_updator.take_update_info_from_eplb_process() - moe_comm_type = self._select_moe_comm_method(num_input_tokens, - self.with_prefill) + moe_comm_type = self._select_moe_comm_method(num_input_tokens) uniform_decode = (max_query_len == self.uniform_decode_query_len) and ( scheduler_output.total_num_scheduled_tokens @@ -2823,7 +2816,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): with_prefill) = self._sync_metadata_across_dp(num_tokens, with_prefill) - moe_comm_type = self._select_moe_comm_method(num_tokens, with_prefill) + moe_comm_type = self._select_moe_comm_method(num_tokens) # If 
cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.seperate_routine(). This means that we are using @@ -2999,8 +2992,7 @@ class NPUModelRunner(LoRAModelRunnerMixin): # allowing vLLM to correctly estimate the maximum memory required. if self.max_num_tokens > self.mc2_tokens_capacity and \ self._select_moe_comm_method( - self.mc2_tokens_capacity, - with_prefill=True) == MoECommType.MC2: + self.mc2_tokens_capacity) == MoECommType.MC2: self._dummy_run(self.mc2_tokens_capacity, with_prefill=True) output = None