[Refactor][MoE] remove redundant code after refactoring fused_moe (#2612)

### What this PR does / why we need it? There is a lot of redundant MoE-related code here, and its structure is not very clear. This PR does the following: moves the relatively independent code related to apply_mlp into a separate file; removes the environment variables for alltoall_buffer and alltoall_seq; and removes the code related to alltoall_buffer and alltoall_seq, retaining only the TokenDispatcher inheritance classes. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? e2e & ut - vLLM version: v0.10.1.1 - vLLM main: 4071c76cf3 --------- Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com> Signed-off-by: weijinqian_v1 <weijinqian@huawei.com> Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-08-30 22:28:50 +08:00
parent 20ae71291d
commit 3a5fc5ee01
13 changed files with 417 additions and 1237 deletions
--- a/tests/e2e/multicard/test_offline_inference_distributed.py
+++ b/tests/e2e/multicard/test_offline_inference_distributed.py
@@ -108,14 +108,13 @@ def test_models_distributed_pangu():
    ]
    max_tokens = 5

-    with VllmRunner(
-            snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
-            max_model_len=8192,
-            enforce_eager=True,
-            dtype="auto",
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
+    with VllmRunner(snapshot_download("vllm-ascend/pangu-pro-moe-pruing"),
+                    max_model_len=8192,
+                    enforce_eager=True,
+                    dtype="auto",
+                    tensor_parallel_size=2,
+                    distributed_executor_backend="mp",
+                    enable_expert_parallel=True) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


@@ -141,28 +140,6 @@ def test_models_distributed_topk() -> None:
        vllm_model.generate(example_prompts, sampling_params)


-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ": "1"})
-def test_models_distributed_alltoallv() -> None:
-    example_prompts = [
-        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
-        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
-        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
-    ]
-    dtype = "half"
-    sampling_params = SamplingParams(max_tokens=5,
-                                     temperature=0.0,
-                                     top_k=50,
-                                     top_p=0.9)
-
-    with VllmRunner(
-            "deepseek-ai/DeepSeek-V2-Lite",
-            dtype=dtype,
-            tensor_parallel_size=2,
-            distributed_executor_backend="mp",
-    ) as vllm_model:
-        vllm_model.generate(example_prompts, sampling_params)
-
-
 def test_models_distributed_Qwen3_W8A8():
    example_prompts = [
        "Hello, my name is",
--- a/tests/ut/ops/test_common_fused_moe.py
+++ b/tests/ut/ops/test_common_fused_moe.py
@@ -0,0 +1,69 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+#
+from unittest.mock import patch
+
+import torch
+
+from tests.ut.base import TestBase
+from vllm_ascend.ops.common_fused_moe import fused_experts_moge
+
+
+class TestFusedExpertsMoGE(TestBase):
+
+    def test_fused_experts_moge(self):
+        with patch('torch_npu.npu_grouped_matmul') as mock_grouped_matmul, \
+             patch('torch_npu.npu_swiglu') as mock_swiglu, \
+             patch('vllm_ascend.utils.is_310p') as mock_is_310p:
+
+            mock_is_310p.return_value = False
+
+            mock_grouped_matmul.side_effect = lambda x, weight, **kwargs: [
+                torch.randn(x[0].shape[0], weight[0].shape[1])
+            ]
+
+            mock_swiglu.side_effect = lambda x: x
+
+            hidden_states = torch.randn(4, 128)
+            w1 = torch.randn(4, 256, 128)
+            w2 = torch.randn(4, 128, 128)
+            topk_weights = torch.rand(4, 1)
+            topk_ids = torch.tensor([[0], [1], [2], [3]], dtype=torch.long)
+            top_k = 1
+            global_num_experts = 4
+
+            moe_parallel_config = type(
+                'MockConfig', (), {
+                    'ep_size': 1,
+                    'tp_size': 1,
+                    'dp_size': 1,
+                    'tp_rank': 0,
+                    'dp_rank': 0,
+                    'ep_rank': 0,
+                    'use_ep': True
+                })()
+
+            output = fused_experts_moge(
+                hidden_states=hidden_states,
+                w1=w1,
+                w2=w2,
+                moe_parallel_config=moe_parallel_config,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                top_k=top_k,
+                global_num_experts=global_num_experts,
+                apply_router_weight_on_input=True,
+            )
+
+            self.assertEqual(output.shape, (4, 128))
--- a/tests/ut/ops/test_fused_ops.py
+++ b/tests/ut/ops/test_fused_ops.py
@@ -27,9 +27,9 @@ from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import (FusedMoEState,
                                                _get_fused_moe_state)
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
-                                       AscendUnquantizedFusedMoEMethod,
-                                       unified_apply_mlp)
+                                       AscendUnquantizedFusedMoEMethod)
 from vllm_ascend.ops.layers.experts_selector import select_experts
+from vllm_ascend.ops.layers.moe_mlp import unified_apply_mlp
 from vllm_ascend.utils import AscendSocVersion, adapt_patch

 adapt_patch(True)
@@ -129,36 +129,38 @@ def mock_dist_env(mocker: MockerFixture):
        with_quant=False)

    with patch('torch.distributed.get_rank', return_value=0), \
-         patch('torch.distributed.get_world_size', return_value=4), \
-         patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
-         patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
-         patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-         patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
-         patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-         patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
-         patch('torch.distributed.all_gather'), \
-         patch('torch.distributed.all_to_all_single'), \
-         patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
-         patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
-         patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
-               return_value=mock_dp_and_tp_group(mocker)), \
-         patch('vllm_ascend.ops.fused_moe.get_ascend_config',
-               return_value=MagicMock(
-                   torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
-                   expert_map_path=None
-               )), \
-         patch('vllm_ascend.ops.fused_moe.determine_expert_map',
-               return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
-         patch('vllm_ascend.ops.fused_moe.get_forward_context',
-               return_value=mock_forward_context_obj), \
+        patch('torch.distributed.get_world_size', return_value=4), \
+        patch('vllm_ascend.ops.fused_moe.get_ep_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_mc2_group', return_value=mock_ep_and_mc2_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm.distributed.parallel_state.get_tp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm.model_executor.layers.fused_moe.layer.get_dp_group', return_value=mock_dp_and_tp_group(mocker)), \
+        patch('torch.distributed.all_gather'), \
+        patch('torch.distributed.all_to_all_single'), \
+        patch('vllm_ascend.ops.fused_moe.tensor_model_parallel_all_reduce'), \
+        patch('vllm_ascend.ops.fused_moe.data_parallel_reduce_scatter'), \
+        patch('vllm.model_executor.layers.fused_moe.config.get_dp_group',
+            return_value=mock_dp_and_tp_group(mocker)), \
+        patch('vllm_ascend.ops.fused_moe.get_ascend_config',
+            return_value=MagicMock(
+                torchair_graph_config=MagicMock(enabled=False, enable_multistream_moe=False),
+                expert_map_path=None
+            )), \
+        patch('vllm_ascend.ops.fused_moe.determine_expert_map',
+            return_value=(3, torch.tensor([0, 1, 2, -1, -1, -1, -1, -1]))), \
+        patch('vllm_ascend.ops.fused_moe.get_forward_context',
+            return_value=mock_forward_context_obj), \
        patch('vllm_ascend.ops.fused_moe.get_current_vllm_config',
-               return_value=MagicMock(
-                   parallel_config=MagicMock(tensor_parallel_size=2),
-                   scheduler_config=MagicMock(max_num_seqs=4),
-                   model_config=MagicMock(max_model_len=2048)
-               )), \
+                return_value=MagicMock(
+                    parallel_config=MagicMock(tensor_parallel_size=2),
+                    scheduler_config=MagicMock(max_num_seqs=4),
+                    model_config=MagicMock(max_model_len=2048)
+                )), \
        patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
-        patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers):
+        patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers), \
+        patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context',
+                return_value=mock_forward_context_obj):

        yield {
            'mock_forward_context_obj': mock_forward_context_obj,
@@ -441,12 +443,11 @@ class TestAscendUnquantizedFusedMoEMethod:

            assert result.shape == expected_shape

-    @pytest.mark.parametrize("others_param",
-                             [[16, False], [1, True], [1, False], [4, False]])
+    @pytest.mark.parametrize("others_param", [16, 1, 4])
    def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                                   mock_moe_env, others_param):

-        ep_size, alltoall_buffer = others_param
+        ep_size = others_param
        is_prefill = False

        if ep_size == 1:
@@ -464,9 +465,7 @@ class TestAscendUnquantizedFusedMoEMethod:
                                    with_quant=False,
                                    token_dispatcher=selected_token_dispatcher)

-        with patch("vllm_ascend.ops.fused_moe.MOE_ALL2ALL_BUFFER",
-                   alltoall_buffer), \
-             patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context), \
             patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3):

            expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
@@ -475,8 +474,6 @@ class TestAscendUnquantizedFusedMoEMethod:
            if ep_size == 1:
                x = x.view(-1, 2)
            router_logits = torch.randn(8, 8)
-            if alltoall_buffer:
-                moe_method.max_model_len = 1
            layer = MagicMock()

            local_num_experts = 2
@@ -529,9 +526,8 @@ class TestExpertsSelector:

 class TestUnifiedApplyMLP(TestBase):

-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
-    @patch('vllm_ascend.ops.fused_moe.get_mc2_group')
-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_dynamic_quant')
    @patch('torch_npu.npu_dequant_swiglu_quant')
@@ -539,16 +535,12 @@ class TestUnifiedApplyMLP(TestBase):
                                                     mock_npu_dynamic_quant,
                                                     mock_npu_grouped_matmul,
                                                     mock_is_310p,
-                                                     mock_get_mc2_group,
                                                     mock_get_forward_context):

        mock_forward_context = MagicMock()
        mock_forward_context.fused_moe_state = FusedMoEState.MC2
        mock_get_forward_context.return_value = mock_forward_context

-        mock_mc2_group = MagicMock()
-        mock_get_mc2_group.return_value = mock_mc2_group
-
        mock_is_310p.return_value = False

        mock_npu_dynamic_quant.return_value = (torch.randint(-128,
@@ -601,7 +593,7 @@ class TestUnifiedApplyMLP(TestBase):

        self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
@@ -643,7 +635,7 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

-    @patch('vllm_ascend.ops.fused_moe.get_forward_context')
+    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
@@ -703,7 +695,7 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.fused_moe.is_310p')
+    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -17,57 +17,13 @@

 from unittest.mock import MagicMock, PropertyMock, patch

-import pytest
 import torch
-from pytest_mock import MockerFixture

-from tests.ut.base import PytestBase, TestBase
+from tests.ut.base import TestBase
 from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
-    AscendSocVersion, MoEAlltoAllSeqOverLapDispatcher, MoEDispatcherConfig,
-    TokenDispatcherWithAll2AllV, TokenDispatcherWithAllGather,
-    TokenDispatcherWithMC2, _Dispatchers, _register_token_dispatcher,
-    get_token_dispatcher, setup_token_dispatchers)
-
-
-class TestMoEAlltoAllSeqOverLapDispatcher(PytestBase):
-
-    @pytest.fixture
-    def config(self):
-        config = MoEDispatcherConfig()
-        config.set_num_local_experts(2)
-        config.set_num_moe_experts(4)
-        config.set_moe_pad_expert_input_to_capacity(False)
-        config.set_moe_expert_capacity_factor(None)
-        config.set_moe_router_topk(2)
-        config.set_moe_grouped_gemm(False)
-        config.set_group_topk(0)
-        config.set_num_groups(1)
-        config.set_is_fused(False)
-        return config.build()
-
-    def mock_ep_group(self, mocker):
-        mock_group = mocker.MagicMock()
-        mock_group.rank_in_group = 0
-        mock_group.world_size = 2
-        mock_group.device_group = "mock_group"
-        return mock_group
-
-    @pytest.fixture
-    def dispatcher(self, config, mocker: MockerFixture):
-        mocker.patch(
-            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
-            return_value=self.mock_ep_group(mocker))
-        mocker.patch("torch.npu.current_device", return_value="cpu")
-        mocker.patch("torch.npu.Stream", return_value=mocker.MagicMock)
-        return MoEAlltoAllSeqOverLapDispatcher(config)
-
-    def test_initialization(self, dispatcher, config):
-        assert dispatcher.num_local_experts == config.num_local_experts
-        assert dispatcher.num_experts == config.num_moe_experts
-        assert dispatcher.local_expert_indices == [0, 1]
-        assert dispatcher.ep_rank == 0
-        assert dispatcher.ep_size == 2
-        assert dispatcher.overlap_stream is not None
+    AscendSocVersion, TokenDispatcherWithAll2AllV,
+    TokenDispatcherWithAllGather, TokenDispatcherWithMC2, _Dispatchers,
+    _register_token_dispatcher, get_token_dispatcher, setup_token_dispatchers)


 class TestTokenDispatcherWithMC2(TestBase):
--- a/tests/ut/torchair/ops/test_torchair_fused_moe.py
+++ b/tests/ut/torchair/ops/test_torchair_fused_moe.py
@@ -353,8 +353,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
            else:
                assert result.shape == x.shape

-    @pytest.mark.parametrize("others_param",
-                             [[16, False], [1, True], [1, False], [4, False]])
+    @pytest.mark.parametrize("others_param", [16, 1, 4])
    def test_apply_with_expert_map(self, moe_method, mock_dist_env,
                                   mock_moe_env, others_param):
        """
@@ -363,13 +362,11 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
        3 test use_select_experts and fused_experts_with_all2all
        4 test use_select_experts and fused_experts
        """
-        ep_size, alltoall_buffer = others_param
+        ep_size = others_param
        is_prefill = False
        forward_context = MagicMock(
            fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
-        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.MOE_ALL2ALL_BUFFER",
-                   alltoall_buffer), \
-             patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
+        with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
             patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
            expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
            moe_method.ep_size = ep_size
@@ -377,8 +374,6 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
            if ep_size == 1:
                x = x.view(-1, 2)
            router_logits = torch.randn(8, 8)
-            if alltoall_buffer:
-                moe_method.max_model_len = 1
            layer = MagicMock()
            layer.w13_weight = torch.randn(8, 16, 1)
            layer.w2_weight = torch.randn(16, 8, 1)