[main] [refactor] refactor common_fused_moe.py (#2706)

### What this PR does / why we need it?

1. Move prepare/finalize operation from moe_comm_method to /ops/moe/fused_moe_prepare_and_finalize
2. Adapt to token_dispatcher in moe_comm_method
3. Move moe_comm_method/experts_selector/token_dispatcher/fused_moe_prepare_and_finalize to /ops/moe

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?

e2e & ut

- vLLM version: v0.10.1.1
- vLLM main: f4962a6d55

Signed-off-by: weichen <calvin_zhu0210@outlook.com>
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-08 20:09:50 +08:00
parent 1a82b16355
commit a041d4f328
21 changed files with 1052 additions and 932 deletions
--- a/tests/e2e/singlecard/ops/test_fused_moe.py
+++ b/tests/e2e/singlecard/ops/test_fused_moe.py
@@ -28,9 +28,8 @@ import torch
 import torch_npu
 from vllm.model_executor.layers.activation import SiluAndMul

-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
-    TokenDispatcherWithAllGather
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather

 NUM_EXPERTS = [8, 64]
 EP_SIZE = [1]
@@ -209,7 +208,7 @@ def test_select_experts(
                                 dtype=torch.int32)
        custom_routing_function.return_value = (mock_weights, mock_ids)

-    with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
+    with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
               ) as mock_native_grouped_topk:
        mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
            x)
--- a/tests/e2e/singlecard/ops/test_moe_comm.py
+++ b/tests/e2e/singlecard/ops/test_moe_comm.py
@@ -1,175 +0,0 @@
-# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# Copyright 2023 The vLLM team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# This file is a part of the vllm-ascend project.
-
-import gc
-from types import SimpleNamespace
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.fused_moe.config import (  # isort: skip
-    FusedMoEConfig, FusedMoEParallelConfig)
-
-from vllm_ascend.distributed.moe_comm_method import (  # isort: skip
-    AllGatherCommImpl, NativeAllGatherCommImpl)
-
-
-@pytest.mark.parametrize("num_tokens", [16, 128])
-@pytest.mark.parametrize("hidden_size", [64, 128])
-@pytest.mark.parametrize("global_num_experts", [8, 16])
-@pytest.mark.parametrize("num_local_experts", [4, 8])
-@pytest.mark.parametrize("top_k_num", [2, 4])
-@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
-@pytest.mark.parametrize("ep_rank", [0, 1])
-@pytest.mark.parametrize("apply_a8_quantization", [False])
-def test_all_gather_comm_impl(
-    num_tokens,
-    hidden_size,
-    global_num_experts,
-    num_local_experts,
-    top_k_num,
-    dtype,
-    ep_rank,
-    apply_a8_quantization,
-    mocker,
-):
-    """
-    Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
-
-    This test compares the outputs of the NPU-optimized AllGatherCommImpl
-    with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
-    correctness across various configurations.
-    """
-    if top_k_num > global_num_experts:
-        pytest.skip("top_k_num cannot be greater than global_num_experts")
-    if num_local_experts > global_num_experts:
-        pytest.skip(
-            "num_local_experts cannot be greater than global_num_experts")
-
-    device = torch.device("npu")
-
-    # mock get_tensor_model_parallel_rank to return ep_rank
-    mocker.patch(
-        "vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
-        return_value=ep_rank,
-    )
-
-    # make moe config
-    parallel_config = SimpleNamespace(
-        enable_expert_parallel=num_local_experts < global_num_experts)
-    moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
-        tp_size_=max(2, global_num_experts // num_local_experts),
-        dp_size_=1,
-        vllm_parallel_config=parallel_config,
-    )
-
-    moe_config = FusedMoEConfig(
-        num_experts=global_num_experts,
-        experts_per_token=top_k_num,
-        hidden_dim=hidden_size,
-        num_local_experts=num_local_experts,
-        moe_parallel_config=moe_parallel_config,
-        in_dtype=dtype,
-        quant_config=None,  # No quantization in this test
-        max_num_tokens=num_tokens,
-    )
-
-    # Instantiate implementations
-    native_impl = NativeAllGatherCommImpl(moe_config)
-
-    all_gather_impl = AllGatherCommImpl(moe_config)
-
-    # --- Input Data ---
-    hidden_states = torch.randn(num_tokens,
-                                hidden_size,
-                                device=device,
-                                dtype=dtype)
-    topk_ids = torch.randint(0,
-                             global_num_experts, (num_tokens, top_k_num),
-                             device=device,
-                             dtype=torch.int32)
-    topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
-    topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
-
-    num_experts = global_num_experts
-
-    expert_map = None
-    if num_local_experts < global_num_experts:
-        # Create a map where some experts are local and some are not
-        expert_map = torch.full((global_num_experts, ), -1, device=device)
-        expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
-                   num_local_experts] = torch.arange(num_local_experts,
-                                                     device=device)
-    num_experts = num_local_experts
-
-    # --- Run Native Implementation (Golden Reference) ---
-    native_hidden_states_out = hidden_states.clone()
-    (
-        native_permuted_hidden,
-        native_expert_tokens,
-        _,
-        _,
-    ) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
-                            num_experts, apply_a8_quantization)
-    # Simulate MLP output
-    native_mlp_output = torch.randn_like(native_permuted_hidden)
-    native_impl.unpermute(native_mlp_output, native_hidden_states_out)
-
-    # --- Run AllGather Implementation ---
-    all_gather_hidden_states_out = hidden_states.clone()
-    (
-        all_gather_permuted_hidden,
-        all_gather_expert_tokens,
-        _,
-        _,
-    ) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
-                                expert_map, num_experts, apply_a8_quantization)
-
-    # Use the same simulated MLP output for a fair comparison
-    all_gather_mlp_output = native_mlp_output.clone()
-
-    all_gather_impl.unpermute(all_gather_mlp_output,
-                              all_gather_hidden_states_out)
-
-    # --- Assertions ---
-    # Define tolerance based on dtype
-    atol = 1e-3 if dtype == torch.float16 else 1e-2
-    rtol = 1e-3 if dtype == torch.float16 else 1e-2
-
-    # 1. Compare expert_tokens from pre_process
-    assert torch.allclose(native_expert_tokens.to(
-        all_gather_expert_tokens.device),
-                          all_gather_expert_tokens,
-                          atol=atol,
-                          rtol=rtol), "Expert tokens do not match."
-
-    # 2. Compare permuted_hidden_states from pre_process
-    num_valid_tokens = native_expert_tokens.sum()
-    assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
-        all_gather_permuted_hidden.device),
-                          all_gather_permuted_hidden[:num_valid_tokens],
-                          atol=atol,
-                          rtol=rtol), "Permuted hidden states do not match."
-
-    # 3. Compare final hidden_states from post_process
-    assert torch.allclose(native_hidden_states_out.to(
-        all_gather_hidden_states_out.device),
-                          all_gather_hidden_states_out,
-                          atol=atol,
-                          rtol=rtol), "Final hidden states do not match."
-    gc.collect()
-    torch.npu.empty_cache()
-    torch.npu.reset_peak_memory_stats()
--- a/tests/ut/ops/test_fused_moe_prepare_and_finalize.py
+++ b/tests/ut/ops/test_fused_moe_prepare_and_finalize.py
@@ -0,0 +1,218 @@
+import unittest
+from unittest.mock import MagicMock, patch
+
+import torch
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig
+
+from vllm_ascend.ops.moe.fused_moe_prepare_and_finalize import (
+    FusedMoEPrepareAndFinalizeWithAll2All,
+    FusedMoEPrepareAndFinalizeWithAllGather, FusedMoEPrepareAndFinalizeWithMC2)
+
+
+class TestFusedMoEPrepareAndFinalize(unittest.TestCase):
+
+    def setUp(self):
+        # Mock FusedMoEConfig
+        self.moe_config = MagicMock(spec=FusedMoEConfig)
+        self.moe_config.tp_group = MagicMock()
+        self.moe_config.tp_group.device_group = MagicMock()
+        self.moe_config.dp_size = 1
+        self.moe_config.tp_size = 1
+        self.moe_config.ep_size = 1
+        self.moe_config.dp_group = MagicMock()
+
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_world_size",
+        return_value=1)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_rank",
+        return_value=0)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_forward_context"
+    )
+    def test_mc2_prepare_finalize(self, mock_get_forward_context, mock_tp_rank,
+                                  mock_tp_size):
+        mock_context = MagicMock()
+        mock_context.mc2_mask = torch.tensor([1, 0, 1])
+        mock_context.padded_num_tokens = 4
+        mock_get_forward_context.return_value = mock_context
+
+        layer = FusedMoEPrepareAndFinalizeWithMC2(self.moe_config)
+
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+
+        h_out, r_out, mask = layer.prepare(hidden_states, router_logits)
+
+        # Check padding and split
+        self.assertEqual(h_out.shape[0], 4)
+        self.assertEqual(r_out.shape[0], 4)
+        self.assertEqual(mask.tolist(), [1, 0, 1])
+
+        # Finalize
+        result = layer.finalize(h_out, reduce_results=False)
+        self.assertEqual(result.shape[0], 3)
+
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_world_size",
+        return_value=2)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_rank",
+        return_value=0)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_forward_context"
+    )
+    @patch("torch.distributed.all_gather")
+    def test_mc2_tp_split_allgather(self, mock_all_gather,
+                                    mock_get_forward_context, mock_tp_rank,
+                                    mock_tp_size):
+        mock_context = MagicMock()
+        mock_context.mc2_mask = torch.tensor([1, 0, 1, 0])
+        mock_context.padded_num_tokens = 4
+        mock_get_forward_context.return_value = mock_context
+
+        layer = FusedMoEPrepareAndFinalizeWithMC2(self.moe_config)
+        hidden_states = torch.randn(4, 8)
+        router_logits = torch.randn(4, 2)
+
+        h_out, r_out, mask = layer.prepare(hidden_states,
+                                           router_logits,
+                                           enable_shared_expert_dp=False,
+                                           replace_allreduce=False)
+
+        # With TP=2, should split into 2 parts
+        self.assertEqual(h_out.shape[0], 2)
+
+        # Mock all_gather behavior
+        def mock_all_gather_func(tensor_list, tensor, group=None):
+            tensor_list[0] = tensor
+            tensor_list[1] = tensor.clone()
+
+        mock_all_gather.side_effect = mock_all_gather_func
+
+        layer.split_hidden_states = [
+            torch.zeros_like(h_out),
+            torch.zeros_like(h_out)
+        ]
+        final_result = layer.finalize(h_out, reduce_results=False)
+
+        # Should concat back to original size
+        self.assertEqual(final_result.shape[0], 4)
+
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_world_size",
+        return_value=1)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_rank",
+        return_value=0)
+    def test_all2all_prepare_finalize(self, mock_tp_rank, mock_tp_size):
+        layer = FusedMoEPrepareAndFinalizeWithAll2All(self.moe_config)
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+
+        h_out, r_out, _ = layer.prepare(hidden_states, router_logits)
+
+        # Pad to tp_size=1, so no change
+        self.assertEqual(h_out.shape[0], 3)
+
+        result = layer.finalize(h_out, reduce_results=False)
+        self.assertEqual(result.shape[0], 3)
+
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_world_size",
+        return_value=2)
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_tensor_model_parallel_rank",
+        return_value=0)
+    @patch("torch.distributed.all_gather")
+    def test_all2all_tp_split_allgather(self, mock_all_gather, mock_tp_rank,
+                                        mock_tp_size):
+        layer = FusedMoEPrepareAndFinalizeWithAll2All(self.moe_config)
+        hidden_states = torch.randn(2, 8)
+        router_logits = torch.randn(2, 2)
+
+        h_out, r_out, _ = layer.prepare(hidden_states,
+                                        router_logits,
+                                        enable_shared_expert_dp=False,
+                                        replace_allreduce=False)
+
+        # Split due to TP=2
+        self.assertEqual(h_out.shape[0], 1)
+
+        # Mock all_gather
+        def mock_all_gather_func(tensor_list, tensor, group=None):
+            tensor_list[0] = tensor
+            tensor_list[1] = tensor.clone()
+
+        mock_all_gather.side_effect = mock_all_gather_func
+
+        layer.split_hidden_states = [
+            torch.zeros_like(h_out),
+            torch.zeros_like(h_out)
+        ]
+        final_result = layer.finalize(h_out, reduce_results=False)
+
+        # Should concat back
+        self.assertEqual(final_result.shape[0], 2)
+
+    @patch("vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_dp_group")
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.tensor_model_parallel_all_reduce"
+    )
+    @patch(
+        "vllm_ascend.ops.moe.fused_moe_prepare_and_finalize.get_forward_context"
+    )
+    def test_allgather_prepare_finalize(self, mock_get_forward_context,
+                                        mock_tp_all_reduce, mock_get_dp_group):
+        # Mock forward context
+        mock_context = MagicMock()
+        mock_context.max_tokens_across_dp = 6
+        mock_get_forward_context.return_value = mock_context
+
+        # Create a proper mock for DP group with working all_gather
+        mock_dp_group = MagicMock()
+
+        def mock_all_gather_func(tensor, dim):
+            # Simulate DP=2: repeat the tensor along the specified dimension
+            return torch.cat([tensor, tensor], dim=dim)
+
+        mock_dp_group.all_gather = mock_all_gather_func
+        mock_get_dp_group.return_value = mock_dp_group
+
+        self.moe_config.dp_size = 2
+        self.moe_config.tp_size = 1
+        self.moe_config.ep_size = 1
+        self.moe_config.dp_group = mock_dp_group
+
+        layer = FusedMoEPrepareAndFinalizeWithAllGather(self.moe_config)
+
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+
+        # Mock the gate function for rm_router_logits=False case
+        mock_gate = MagicMock()
+        mock_gate.return_value = (router_logits.repeat(2, 1), None)
+
+        h_out, r_out, _ = layer.prepare(hidden_states,
+                                        router_logits,
+                                        rm_router_logits=False,
+                                        gate=mock_gate)
+
+        # Presumably padded to max_tokens_across_dp (6), then doubled by the mocked DP=2 all-gather -> 12 rows
+        self.assertEqual(h_out.shape[0], 12)
+        self.assertEqual(r_out.shape[0], 12)
+
+        # Finalize with reduce_scatter
+        def mock_reduce_scatter_func(tensor, dim):
+            # Simulate reduce_scatter: keep only the first 3 rows (the original token count)
+            return tensor[:3]
+
+        mock_dp_group.reduce_scatter = mock_reduce_scatter_func
+        result = layer.finalize(h_out, reduce_results=False)
+
+        self.assertEqual(result.shape[0], 3)
+
+        # Test with TP all-reduce
+        mock_tp_all_reduce.return_value = result
+        result_with_tp = layer.finalize(h_out, reduce_results=True)
+        self.assertEqual(result_with_tp.shape[0], 3)
--- a/tests/ut/ops/test_fused_ops.py
+++ b/tests/ut/ops/test_fused_ops.py
@@ -22,14 +22,14 @@ import torch_npu
 from pytest_mock import MockerFixture
 from vllm.model_executor.layers.fused_moe import FusedMoEMethodBase

-import vllm_ascend.ops.moe_dispatcher.token_dispatcher as token_dispatcher_module
+import vllm_ascend.ops.moe.token_dispatcher as token_dispatcher_module
 from tests.ut.base import TestBase
 from vllm_ascend.ascend_forward_context import (FusedMoEState,
                                                _get_fused_moe_state)
 from vllm_ascend.ops.fused_moe import (AscendFusedMoE,
                                       AscendUnquantizedFusedMoEMethod)
-from vllm_ascend.ops.layers.experts_selector import select_experts
-from vllm_ascend.ops.layers.moe_mlp import cumsum_group_list, unified_apply_mlp
+from vllm_ascend.ops.moe.experts_selector import select_experts
+from vllm_ascend.ops.moe.moe_mlp import cumsum_group_list, unified_apply_mlp
 from vllm_ascend.utils import AscendSocVersion, adapt_patch

 adapt_patch(True)
@@ -110,11 +110,11 @@ def mock_dist_env(mocker: MockerFixture):
            captured_dispatchers[key] = mock_token_dispatcher_with_mc2

    mock_register_token_dispatcher_patcher = patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher',
+        'vllm_ascend.ops.moe.token_dispatcher._register_token_dispatcher',
        side_effect=capture_register)

    mock_get_token_dispatcher_patcher = patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_token_dispatcher',
+        'vllm_ascend.ops.moe.token_dispatcher.get_token_dispatcher',
        side_effect=lambda name: captured_dispatchers.get(name))

    default_mock_token_dispatcher = mock_token_dispatcher_with_allgather
@@ -158,7 +158,7 @@ def mock_dist_env(mocker: MockerFixture):
                )), \
        patch("vllm_ascend.utils.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
        patch.object(token_dispatcher_module, 'setup_token_dispatchers', mock_setup_token_dispatchers), \
-        patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context',
+        patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context',
                return_value=mock_forward_context_obj):

        yield {
@@ -562,8 +562,8 @@ class TestCumsumGroupList(TestBase):

 class TestUnifiedApplyMLP(TestBase):

-    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
-    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
+    @patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.moe.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_dynamic_quant')
    @patch('torch_npu.npu_dequant_swiglu_quant')
@@ -629,7 +629,7 @@ class TestUnifiedApplyMLP(TestBase):

        self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
+    @patch('vllm_ascend.ops.moe.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
@@ -671,7 +671,7 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

-    @patch('vllm_ascend.ops.layers.moe_mlp.get_forward_context')
+    @patch('vllm_ascend.ops.moe.moe_mlp.get_forward_context')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
@@ -731,7 +731,7 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.bfloat16)

-    @patch('vllm_ascend.ops.layers.moe_mlp.is_310p')
+    @patch('vllm_ascend.ops.moe.moe_mlp.is_310p')
    @patch('torch_npu.npu_grouped_matmul')
    @patch('torch_npu.npu_swiglu')
    @patch('torch_npu.npu_dynamic_quant')
@@ -776,7 +776,7 @@ class TestUnifiedApplyMLP(TestBase):
        self.assertEqual(result.shape, hidden_states.shape)
        self.assertEqual(result.dtype, torch.float16)

-    @patch("vllm_ascend.ops.layers.moe_mlp.get_forward_context")
+    @patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context")
    @patch("torch_npu.npu_grouped_matmul")
    @patch("torch_npu.npu_swiglu")
    @patch("torch_npu.npu_grouped_matmul_swiglu_quant")
--- a/tests/ut/ops/test_moe_comm_method.py
+++ b/tests/ut/ops/test_moe_comm_method.py
@@ -0,0 +1,212 @@
+from unittest.mock import MagicMock, patch
+
+import torch
+from vllm.model_executor.layers.fused_moe import FusedMoEConfig
+
+from tests.ut.base import TestBase
+from vllm_ascend.ops.moe.moe_comm_method import (AllGatherCommImpl,
+                                                 AlltoAllCommImpl, MC2CommImpl)
+
+
+class TestMoECommMethod(TestBase):
+
+    def setUp(self):
+        # Mock FusedMoEConfig
+        self.moe_config = MagicMock(spec=FusedMoEConfig)
+        self.moe_config.num_experts = 8
+        self.moe_config.num_local_experts = 2
+        self.moe_config.experts_per_token = 2
+        self.moe_config.tp_group = MagicMock()
+        self.moe_config.tp_group.device_group = MagicMock()
+        self.moe_config.dp_size = 1
+        self.moe_config.tp_size = 1
+        self.moe_config.ep_size = 1
+        self.moe_config.dp_group = MagicMock()
+        self.moe_config.num_global_redundant_experts = 0
+
+    @patch("vllm_ascend.ops.moe.moe_comm_method.get_forward_context")
+    @patch(
+        "vllm_ascend.ops.moe.moe_comm_method.FusedMoEPrepareAndFinalizeWithAllGather"
+    )
+    @patch("vllm_ascend.ops.moe.moe_comm_method.TokenDispatcherWithAllGather")
+    def test_all_gather_comm_impl(self, mock_token_dispatcher,
+                                  mock_prepare_finalize,
+                                  mock_get_forward_context):
+        # Mock forward context
+        mock_context = MagicMock()
+        mock_context.moe_comm_method = "all_gather"
+        mock_get_forward_context.return_value = mock_context
+
+        # Mock prepare finalize
+        mock_pf_instance = MagicMock()
+        mock_pf_instance.prepare.return_value = (torch.randn(4, 8),
+                                                 torch.randn(4, 2), None)
+        mock_pf_instance.finalize.return_value = torch.randn(4, 8)
+        mock_prepare_finalize.return_value = mock_pf_instance
+
+        # Mock token dispatcher
+        mock_td_instance = MagicMock()
+        mock_token_dispatcher.return_value = mock_td_instance
+
+        # Create instance
+        comm_impl = AllGatherCommImpl(self.moe_config)
+
+        # Test prepare method
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+        h_out, r_out = comm_impl.prepare(hidden_states, router_logits)
+
+        # Verify prepare was called with correct arguments
+        mock_pf_instance.prepare.assert_called_once_with(
+            hidden_states, router_logits, False, False, False, None)
+
+        # Test finalize method
+        comm_impl.finalize(h_out, reduce_results=True)
+        mock_pf_instance.finalize.assert_called_once_with(h_out, True)
+
+    @patch("vllm_ascend.ops.moe.moe_comm_method.get_forward_context")
+    @patch(
+        "vllm_ascend.ops.moe.moe_comm_method.FusedMoEPrepareAndFinalizeWithMC2"
+    )
+    @patch("vllm_ascend.ops.moe.moe_comm_method.TokenDispatcherWithMC2")
+    def test_mc2_comm_impl(self, mock_token_dispatcher, mock_prepare_finalize,
+                           mock_get_forward_context):
+        # Mock forward context
+        mock_context = MagicMock()
+        mock_context.moe_comm_method = "mc2"
+        mock_get_forward_context.return_value = mock_context
+
+        # Mock prepare finalize
+        mock_pf_instance = MagicMock()
+        mock_pf_instance.prepare.return_value = (torch.randn(4, 8),
+                                                 torch.randn(4, 2),
+                                                 torch.tensor([1, 0, 1, 0]))
+        mock_pf_instance.finalize.return_value = torch.randn(4, 8)
+        mock_prepare_finalize.return_value = mock_pf_instance
+
+        # Mock token dispatcher
+        mock_td_instance = MagicMock()
+        mock_token_dispatcher.return_value = mock_td_instance
+
+        # Create instance
+        comm_impl = MC2CommImpl(self.moe_config)
+
+        # Test prepare method
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+        h_out, r_out = comm_impl.prepare(hidden_states, router_logits)
+
+        # Verify prepare was called with correct arguments
+        mock_pf_instance.prepare.assert_called_once_with(
+            hidden_states, router_logits, False, False, False, None)
+
+        # Test finalize method
+        comm_impl.finalize(h_out, reduce_results=True)
+        mock_pf_instance.finalize.assert_called_once_with(h_out, True)
+
+    @patch("vllm_ascend.ops.moe.moe_comm_method.get_forward_context")
+    @patch(
+        "vllm_ascend.ops.moe.moe_comm_method.FusedMoEPrepareAndFinalizeWithAll2All"
+    )
+    @patch("vllm_ascend.ops.moe.moe_comm_method.TokenDispatcherWithAll2AllV")
+    def test_alltoall_comm_impl(self, mock_token_dispatcher,
+                                mock_prepare_finalize,
+                                mock_get_forward_context):
+        # Mock forward context
+        mock_context = MagicMock()
+        mock_context.moe_comm_method = "alltoall"
+        mock_get_forward_context.return_value = mock_context
+
+        # Mock prepare finalize
+        mock_pf_instance = MagicMock()
+        mock_pf_instance.prepare.return_value = (torch.randn(4, 8),
+                                                 torch.randn(4, 2), None)
+        mock_pf_instance.finalize.return_value = torch.randn(4, 8)
+        mock_prepare_finalize.return_value = mock_pf_instance
+
+        # Mock token dispatcher
+        mock_td_instance = MagicMock()
+        mock_token_dispatcher.return_value = mock_td_instance
+
+        # Create instance
+        comm_impl = AlltoAllCommImpl(self.moe_config)
+
+        # Test prepare method
+        hidden_states = torch.randn(3, 8)
+        router_logits = torch.randn(3, 2)
+        h_out, r_out = comm_impl.prepare(hidden_states, router_logits)
+
+        # Verify prepare was called with correct arguments
+        mock_pf_instance.prepare.assert_called_once_with(
+            hidden_states, router_logits, False, False, False, None)
+
+    @patch("vllm_ascend.ops.moe.moe_comm_method.get_forward_context")
+    @patch(
+        "vllm_ascend.ops.moe.moe_comm_method.FusedMoEPrepareAndFinalizeWithAllGather"
+    )
+    @patch("vllm_ascend.ops.moe.moe_comm_method.TokenDispatcherWithAllGather")
+    @patch("vllm_ascend.ops.moe.moe_comm_method.unified_apply_mlp")
+    def test_fused_experts_method(self, mock_unified_apply_mlp,
+                                  mock_token_dispatcher, mock_prepare_finalize,
+                                  mock_get_forward_context):
+        # Mock forward context
+        mock_context = MagicMock()
+        mock_context.moe_comm_method = "all_gather"
+        mock_get_forward_context.return_value = mock_context
+
+        # Mock prepare finalize
+        mock_pf_instance = MagicMock()
+        mock_pf_instance.prepare.return_value = (torch.randn(4, 8),
+                                                 torch.randn(4, 2), None)
+        mock_pf_instance.finalize.return_value = torch.randn(4, 8)
+        mock_prepare_finalize.return_value = mock_pf_instance
+
+        # Mock token dispatcher
+        mock_td_instance = MagicMock()
+        mock_td_instance.token_dispatch.return_value = {
+            "hidden_states": torch.randn(6, 8),
+            "group_list": torch.tensor([2, 2, 2]),
+            "group_list_type": 1
+        }
+        mock_td_instance.token_combine.return_value = torch.randn(4, 8)
+        mock_token_dispatcher.return_value = mock_td_instance
+
+        # Mock unified_apply_mlp
+        mock_unified_apply_mlp.return_value = torch.randn(6, 8)
+
+        # Create instance
+        comm_impl = AllGatherCommImpl(self.moe_config)
+
+        # Test fused_experts method
+        hidden_states = torch.randn(4, 8).contiguous()
+        w1 = torch.randn(16, 8).contiguous()
+        w2 = torch.randn(16, 8).contiguous()
+        topk_weights = torch.tensor([[0.5, 0.5], [0.3, 0.7], [0.8, 0.2],
+                                     [0.6, 0.4]])
+        topk_ids = torch.tensor([[0, 1], [1, 2], [2, 0], [1, 1]])
+        row_idx = torch.arange(4)
+
+        # Make sure tensors are contiguous and have correct strides
+        hidden_states = hidden_states.contiguous()
+        w1 = w1.contiguous()
+        w2 = w2.contiguous()
+
+        result = comm_impl.fused_experts(hidden_states=hidden_states,
+                                         w1=w1,
+                                         w2=w2,
+                                         topk_weights=topk_weights,
+                                         topk_ids=topk_ids,
+                                         row_idx=row_idx,
+                                         activation="silu")
+
+        # Verify result shape
+        self.assertEqual(result.shape, (4, 8))
+
+        # Verify token_dispatch was called
+        mock_td_instance.token_dispatch.assert_called_once()
+
+        # Verify unified_apply_mlp was called
+        mock_unified_apply_mlp.assert_called_once()
+
+        # Verify token_combine was called
+        mock_td_instance.token_combine.assert_called_once()
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -20,7 +20,8 @@ from unittest.mock import MagicMock, PropertyMock, patch
 import torch

 from tests.ut.base import TestBase
-from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
+
+from vllm_ascend.ops.moe.token_dispatcher import (  # isort: skip
    AscendSocVersion, TokenDispatcherWithAll2AllV,
    TokenDispatcherWithAllGather, TokenDispatcherWithMC2, _Dispatchers,
    _register_token_dispatcher, get_token_dispatcher, setup_token_dispatchers)
@@ -34,7 +35,7 @@ class TestTokenDispatcherWithMC2(TestBase):
        self.mc2_group.rank_in_group = 0
        self.mc2_group.world_size = 8
        self.mc2_group_patch = patch(
-            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_mc2_group",
+            "vllm_ascend.ops.moe.token_dispatcher.get_mc2_group",
            return_value=self.mc2_group)
        self.mc2_group_patch.start()

@@ -52,7 +53,7 @@ class TestTokenDispatcherWithMC2(TestBase):

        # Mock get_ascend_soc_version()
        self.ascend_soc_version_patch = patch(
-            "vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ascend_soc_version",
+            "vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version",
            return_value=AscendSocVersion.A3)
        self.ascend_soc_version_patch.start()

@@ -329,7 +330,7 @@ class TestTokenDispatcherWithAll2AllV(TestBase):

        # Mock gather_from_sequence_parallel_region
        patcher7 = patch(
-            'vllm_ascend.ops.moe_dispatcher.token_dispatcher.gather_from_sequence_parallel_region'
+            'vllm_ascend.ops.moe.token_dispatcher.gather_from_sequence_parallel_region'
        )
        self.mock_gather_from_sequence_parallel_region = patcher7.start()
        self.addCleanup(patcher7.stop)
@@ -518,12 +519,8 @@ class TestDispatcherRegistry(TestBase):

        self.assertIsNone(get_token_dispatcher("NonExistentDispatcher"))

-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAllGather'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher'
-    )
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithAllGather')
+    @patch('vllm_ascend.ops.moe.token_dispatcher._register_token_dispatcher')
    def test_setup_token_dispatchers_ep_size_1_creates_allgather(
            self, mock_register, mock_allgather_class):
        kwargs = {"top_k": 2, "num_experts": 8}
@@ -537,12 +534,8 @@ class TestDispatcherRegistry(TestBase):
        mock_allgather_class.assert_called_once_with(**kwargs)
        mock_register.assert_called_once_with(mock_instance)

-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher'
-    )
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithAll2AllV')
+    @patch('vllm_ascend.ops.moe.token_dispatcher._register_token_dispatcher')
    def test_setup_token_dispatchers_ep_size_2_creates_all2allv(
            self, mock_register, mock_all2allv_class):
        kwargs = {"top_k": 2, "num_experts": 16, "num_local_experts": 2}
@@ -556,15 +549,9 @@ class TestDispatcherRegistry(TestBase):
        mock_all2allv_class.assert_called_once_with(**kwargs)
        mock_register.assert_called_once_with(mock_instance)

-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithMC2'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher'
-    )
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithAll2AllV')
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithMC2')
+    @patch('vllm_ascend.ops.moe.token_dispatcher._register_token_dispatcher')
    def test_setup_token_dispatchers_ep_size_16_creates_all2allv_and_mc2(
            self, mock_register, mock_mc2_class, mock_all2allv_class):
        kwargs = {"top_k": 2, "num_experts": 32, "num_local_experts": 2}
@@ -584,15 +571,9 @@ class TestDispatcherRegistry(TestBase):
        mock_register.assert_any_call(mock_all2allv_instance)
        mock_register.assert_any_call(mock_mc2_instance)

-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithAll2AllV'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher.TokenDispatcherWithMC2'
-    )
-    @patch(
-        'vllm_ascend.ops.moe_dispatcher.token_dispatcher._register_token_dispatcher'
-    )
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithAll2AllV')
+    @patch('vllm_ascend.ops.moe.token_dispatcher.TokenDispatcherWithMC2')
+    @patch('vllm_ascend.ops.moe.token_dispatcher._register_token_dispatcher')
    def test_setup_token_dispatchers_ep_size_16_skips_if_exist(
            self, mock_register, mock_mc2_class, mock_all2allv_class):
        kwargs = {"top_k": 2, "num_experts": 32, "num_local_experts": 2}
--- a/tests/ut/quantization/test_w8a8.py
+++ b/tests/ut/quantization/test_w8a8.py
@@ -5,8 +5,8 @@ import torch

 from tests.ut.base import TestBase
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
-from vllm_ascend.ops.layers.experts_selector import (_native_grouped_topk,
-                                                     select_experts)
+from vllm_ascend.ops.moe.experts_selector import (_native_grouped_topk,
+                                                  select_experts)
 from vllm_ascend.quantization.w8a8 import (AscendC8KVCacheMethod,
                                           AscendW8A8FusedMoEMethod,
                                           AscendW8A8LinearMethod,
@@ -784,7 +784,7 @@ class TestSelectExperts(TestBase):
        self.assertEqual(ids.shape, (self.num_tokens, self.top_k))
        self.assertEqual(ids.dtype, torch.int32)

-    @patch('vllm_ascend.ops.layers.experts_selector._native_grouped_topk')
+    @patch('vllm_ascend.ops.moe.experts_selector._native_grouped_topk')
    def test_grouped_topk_with_correction_bias(self, mock_grouped_topk):
        """Test grouped topk with expert score correction bias"""
        mock_grouped_topk.return_value = torch.ones(self.num_tokens,