[main] [refactor] refactor fused_moe.py to enable token_dispatchers (#2570)

### What this PR does / why we need it?
Enable token_dispatcher to replace fused_experts_with_xxx in eager mode.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main: 704432af3c

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: sherie <963372609@qq.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
Co-authored-by: shiyuan680 <72335504+shiyuan680@users.noreply.github.com>
2025-08-28 10:13:35 +08:00
parent 936c102105
commit 320edde2df
10 changed files with 1066 additions and 1639 deletions
--- a/tests/ut/quantization/test_w8a8_dynamic.py
+++ b/tests/ut/quantization/test_w8a8_dynamic.py
@@ -1,82 +0,0 @@
-from unittest.mock import MagicMock, patch
-
-import torch
-
-from tests.ut.base import TestBase
-from vllm_ascend.quantization.w8a8_dynamic import fused_experts_with_all2all
-
-
-class TestAscendW8A8FusedMoEMethod(TestBase):
-
-    def setUp(self):
-        self.hidden_size = 128
-        self.num_tokens = 128
-        self.placeholder = torch.randn(self.num_tokens,
-                                       self.hidden_size,
-                                       dtype=torch.bfloat16)
-
-    @patch("torch.distributed.all_to_all_single")
-    @patch("torch_npu.npu_moe_re_routing")
-    @patch("torch_npu.npu_grouped_matmul")
-    @patch("torch_npu.npu_swiglu")
-    @patch("torch_npu.npu_dynamic_quant")
-    @patch("torch_npu.npu_moe_finalize_routing")
-    @patch("torch_npu.npu_moe_init_routing")
-    def test_fused_experts_with_all2all(self, mock_moe_init_routing,
-                                        mock_moe_finalize_routing,
-                                        mock_dynamic_quant, mock_swiglu,
-                                        mock_grouped_matmul,
-                                        mock_moe_re_routing,
-                                        mock_all_to_all_single):
-        expert_map = MagicMock()
-        ep_group = MagicMock()
-        placeholder_int8 = torch.randint(0,
-                                         100,
-                                         (self.num_tokens, self.hidden_size),
-                                         dtype=torch.int8)
-        placeholder_ones = torch.ones(self.num_tokens, dtype=torch.int32)
-        mock_all_to_all_single.side_effect = lambda output, input, *args, **kwargs: output.copy_(
-            input)
-        mock_moe_init_routing.return_value = (
-            placeholder_int8,
-            placeholder_ones,
-            placeholder_ones,
-        )
-        mock_moe_re_routing.return_value = (placeholder_int8, self.placeholder,
-                                            torch.randint(0,
-                                                          100,
-                                                          (self.num_tokens, ),
-                                                          dtype=torch.int32),
-                                            self.placeholder)
-        mock_grouped_matmul.return_value = self.placeholder
-        mock_swiglu.return_value = self.placeholder
-        mock_dynamic_quant.return_value = (
-            placeholder_int8,
-            torch.randn(self.num_tokens),
-        )
-        mock_moe_finalize_routing.return_value = self.placeholder
-        row_idx_len = self.num_tokens * 8
-        row_idx = (torch.arange(
-            0,
-            row_idx_len,
-            dtype=torch.int32,
-        ).view(8, -1).permute(1, 0).contiguous())
-
-        result = fused_experts_with_all2all(
-            hidden_states=self.placeholder,
-            w1=self.placeholder,
-            w1_scale=self.placeholder,
-            w2=self.placeholder,
-            w2_scale=self.placeholder,
-            topk_weights=self.placeholder,
-            topk_ids=self.placeholder,
-            row_idx=row_idx,
-            top_k=8,
-            expert_map=expert_map,
-            ep_group=ep_group,
-            log2phy=None,
-            global_redundant_expert_num=256,
-        )
-        self.assertIsNotNone(result)
-        self.assertEqual(result.dtype, torch.bfloat16)
-        self.assertEqual(result.shape, (128, 128))