[Main] [Refactor] Enable MoECommMethod in Eager Mode (#2791)

### What this PR does / why we need it? 1. Replace prepare/finalize operation in fused_moe.py by moe_comm_method.prepare()/finalize() 2. Replace unified_fused_experts by moe_comm_method.fused_experts() in fused_moe.py/w8a8_dynamic.py/w4a8_dynamic.py 3. Add calling _select_moe_comm_method in spec-decode proposers. 4. Currently, w4a8_dynamic does not support gatherep, use all2allv instead. 5. Remove redundant code. ### Does this PR introduce _any_ user-facing change? AllgatherEP switch is disabled in aclgraph/eager mode, just follow the rules in modelrunner_v1._select_moe_comm_method() ### How was this patch tested? e2e & ut - vLLM version: v0.10.2 - vLLM main: 7f6f2c1182 Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com> Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-16 11:06:00 +08:00
parent 0aba644633
commit 18ca7861f6
18 changed files with 523 additions and 596 deletions
--- a/tests/ut/ops/test_fused_moe_prepare_and_finalize.py
+++ b/tests/ut/ops/test_fused_moe_prepare_and_finalize.py
@@ -6,7 +6,8 @@ from vllm.model_executor.layers.fused_moe import FusedMoEConfig

 from vllm_ascend.ops.moe.fused_moe_prepare_and_finalize import (
    FusedMoEPrepareAndFinalizeWithAll2All,
-    FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2)
+    FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2,
+    FusedMoEPrepareAndFinalizeWithNaiveMulticast)


 class TestFusedMoEPrepareAndFinalize(unittest.TestCase):
@@ -216,3 +217,68 @@ class TestFusedMoEPrepareAndFinalize(unittest.TestCase):
        mock_tp_all_reduce.return_value = result
        result_with_tp = layer.finalize(h_out, reduce_results=True)
        self.assertEqual(result_with_tp.shape[0], 3)
+
+    @patch("vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_dp_group")
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.tensor_model_parallel_all_reduce"
+    )
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_forward_context"
+    )
+    def test_naive_multicast_prepare_finalize(self, mock_get_forward_context,
+                                              mock_tp_all_reduce,
+                                              mock_get_dp_group):
+        # Mock forward context with DP metadata
+        mock_context = MagicMock()
+        mock_context.dp_metadata.cu_tokens_across_dp_cpu = torch.tensor(
+            [2, 5, 7])
+        mock_get_forward_context.return_value = mock_context
+
+        # Setup DP group mock
+        mock_dp_group = MagicMock()
+        mock_dp_group.broadcast = MagicMock()
+        mock_dp_group.all_reduce = MagicMock()
+        mock_get_dp_group.return_value = mock_dp_group
+
+        # Mock all_reduce to just return input (simulate sum)
+        def mock_all_reduce(tensor):
+            return tensor * 2
+
+        mock_dp_group.all_reduce.side_effect = mock_all_reduce
+
+        # Setup config
+        self.moe_config.dp_size = 3
+        self.moe_config.dp_rank = 1
+        self.moe_config.tp_size = 1
+        self.moe_config.ep_size = 1
+
+        layer = FusedMoEPrepareAndFinalizeWithNaiveMulticast(self.moe_config)
+
+        # Local inputs
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+
+        # Mock gate for router logits recomputation
+        mock_gate = MagicMock()
+        mock_gate.return_value = (torch.randn(7, 2), None)
+
+        # Run prepare
+        h_out, r_out, _ = layer.prepare(hidden_states,
+                                        router_logits,
+                                        rm_router_logits=False,
+                                        gate=mock_gate)
+
+        # Should be global tensor: [7, 8] and [7, 2]
+        self.assertEqual(h_out.shape, (7, 8))
+        self.assertEqual(r_out.shape, (7, 2))
+
+        # Run finalize
+        result = layer.finalize(h_out, reduce_results=False)
+
+        # Should slice back to local: [3, 8]
+        self.assertEqual(result.shape, (3, 8))
+
+        # Test with reduce_results=True and TP/EP > 1
+        mock_tp_all_reduce.return_value = result
+        result_with_tp = layer.finalize(h_out, reduce_results=True)
+        self.assertEqual(result_with_tp.shape, (3, 8))