[Feature] Support fine-grained shared expert overlap (#5482)

Adds fine-grained control over how shared expert computation overlaps with routed expert execution, preventing resource contention between the two paths.
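A minimal sketch of the idea, assuming the shared experts are launched on a secondary NPU stream so their MLP overlaps with the routed experts' dispatch/combine. The helper names `overlapped_moe_forward`, `routed_forward`, and `shared_experts` are illustrative rather than the PR's actual API, and `torch.npu` streams are assumed to mirror the `torch.cuda` stream interface:

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu namespace on Ascend


def overlapped_moe_forward(hidden_states, routed_forward, shared_experts,
                           shared_stream):
    """Run the shared experts on a side stream so that their MLP overlaps
    with the routed experts' dispatch/MLP/combine on the main stream."""
    main_stream = torch.npu.current_stream()
    # The side stream must see the inputs produced on the main stream.
    shared_stream.wait_stream(main_stream)
    with torch.npu.stream(shared_stream):
        shared_out = shared_experts(hidden_states)
    # Routed-expert work proceeds on the main stream while the shared
    # experts execute concurrently on the side stream.
    routed_out = routed_forward(hidden_states)
    # Join the streams before the two partial results are summed.
    main_stream.wait_stream(shared_stream)
    return routed_out + shared_out
```

A caller would typically create the side stream once (for example `shared_stream = torch.npu.Stream()`) and reuse it across layers.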

- vLLM version: v0.13.0
- vLLM main: 5326c89803

---------

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
Author: Jade Zheng
Date: 2026-01-17 11:53:22 +08:00 (committed by GitHub)
Commit: 22f253142a (parent 48e10de8c9)

9 changed files with 203 additions and 130 deletions


@@ -163,6 +163,7 @@ class TestMoECommMethod(TestBase):
         "vllm_ascend.ops.fused_moe.moe_comm_method.TokenDispatcherWithAllGather"
     )
     @patch("vllm_ascend.ops.fused_moe.moe_comm_method.unified_apply_mlp")
+    @patch("torch.npu.current_stream", MagicMock())
     def test_fused_experts_method(self, mock_unified_apply_mlp,
                                   mock_token_dispatcher, mock_prepare_finalize,
                                   mock_get_forward_context):

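The `torch.npu.current_stream` patch added above suggests that the overlap path queries the current stream somewhere inside the fused-MoE forward; since the unit tests run on hosts without an NPU, that call has to be stubbed out. An equivalent context-manager form of the same mock (the function under test is a hypothetical stand-in):

```python
from unittest.mock import MagicMock, patch

# Stub out the stream query so code that calls torch.npu.current_stream()
# can be exercised on a CPU-only CI machine.
with patch("torch.npu.current_stream", MagicMock()):
    result = fused_experts_method_under_test()  # hypothetical call
```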

@@ -116,26 +116,6 @@ class TestTokenDispatcherWithMC2(TestBase):
         mock_dispatch.assert_called_once()
         self.assertEqual(output.group_list_type, 0)  # group_list_type == 0
-
-    def test_token_dispatch_with_shared_experts_and_quant(self):
-        self.shared_experts = MagicMock()
-        self.shared_experts.gate_up_proj.return_value = (torch.randn(10, 128),
-                                                         torch.tensor(1.0))
-        self.shared_experts.act_fn.return_value = torch.randn(10, 128)
-        self.dispatcher.with_quant = False
-        self.dispatcher.shared_act = torch.randn(10, 128)
-        self.dispatcher.swiglu_out_scale = torch.tensor(1.0)
-        self.hidden_states = torch.randn(10, 128)
-        self.topk_weights = torch.randn(10, 1)
-        with patch("torch_npu.npu_moe_distribute_dispatch_v2",
-                   return_value=(torch.randn(10, 128), ) * 5 + (None, None)):
-            self.dispatcher.token_dispatch(self.hidden_states,
-                                           self.topk_weights,
-                                           torch.randint(0, 8, (10, 1)),
-                                           torch.tensor(
-                                               [0, 1, 2, 3, 4, 5, 6, 7]),
-                                           shared_experts=self.shared_experts)
-
     def test_get_combine_mc_kwargs_with_quant(self):
         self.dispatcher.with_quant = True
         hidden_states = torch.randn(10, 128)