[Feature] Support fine-grained shared expert overlap (#5482)

Add fine-grained control over shared expert overlap to prevent resource
contention.

- vLLM version: v0.13.0
- vLLM main:
5326c89803

---------

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
This commit is contained in:
Jade Zheng
2026-01-17 11:53:22 +08:00
committed by GitHub
parent 48e10de8c9
commit 22f253142a
9 changed files with 203 additions and 130 deletions

View File

@@ -22,7 +22,7 @@
# limitations under the License.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional
from typing import Optional
import torch
import torch_npu
@@ -82,9 +82,6 @@ class MoETokenDispatcher(ABC):
expert_map: Optional[torch.Tensor] = None,
log2phy: Optional[torch.Tensor] = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
mc2_mask: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
with_quant: bool = False,
@@ -193,9 +190,6 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
expert_map: Optional[torch.Tensor] = None,
log2phy: Optional[torch.Tensor] = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
mc2_mask: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
with_quant: bool = False,
@@ -226,12 +220,10 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"ep_recv_counts": ep_recv_counts,
"tp_recv_counts": tp_recv_counts,
"assist_info_for_combine": assist_info_for_combine,
"shared_experts": shared_experts,
"expand_scales": expand_scales
}
group_list_type = 0
return TokenDispatchResult(hidden_states=expand_x,
dynamic_scale=dynamic_scale,
group_list=expert_token_nums,
@@ -297,7 +289,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
combined_output = torch_npu.npu_moe_distribute_combine_v2(**kwargs_mc2) \
if self.enable_dispatch_v2 else torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
return TokenCombineResult(routed_out=combined_output)
return TokenCombineResult(routed_out=combined_output, )
class TokenDispatcherWithAllGather(MoETokenDispatcher):
@@ -319,9 +311,6 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):
expert_map: Optional[torch.Tensor] = None,
log2phy: Optional[torch.Tensor] = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
mc2_mask: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
with_quant: bool = False,
@@ -442,9 +431,6 @@ class TokenDispatcherWithAll2AllV(MoETokenDispatcher):
expert_map: Optional[torch.Tensor] = None,
log2phy: Optional[torch.Tensor] = None,
global_redundant_expert_num: int = 0,
shared_experts: Optional[Any] = None,
quantized_x_for_share: Optional[Any] = None,
dynamic_scale_for_share: Optional[Any] = None,
mc2_mask: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
with_quant: bool = False,