[Refactor] Adjustments to moe_comm_method selection process (#3001)

### What this PR does / why we need it?

Fix issues mentioned in https://github.com/vllm-project/vllm-ascend/pull/2791 and do some minor refactoring:

1. Use Enum instead of string.
2. Avoid setting a new property on forward_context in AscendFusedMoE.forward().
3. Enable TokenDispatcherWithMoge.
4. Remove redundant code.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruing/deepseek-mtp/pangu-pro-moe-pruing:

1. Enable/Disable EP
2. Aclgraph & eager

- vLLM version: v0.10.2
- vLLM main: 9607d5eb44

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-22 19:12:58 +08:00
parent bb1f0d5a62
commit 37a0715eda
14 changed files with 170 additions and 351 deletions
--- a/vllm_ascend/ops/moe/token_dispatcher.py
+++ b/vllm_ascend/ops/moe/token_dispatcher.py
@@ -377,14 +377,13 @@ class TokenDispatcherWithAllGather(MoETokenDispatcher):


 # mypy: disable-error-code="override"
-class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
+class TokenDispatcherWithMoge(MoETokenDispatcher):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.apply_router_weight_on_input = False
-        self.local_ep = 1
-        self.local_num_experts = self.num_experts // self.local_ep
-        self.local_num_group = self.top_k // self.local_ep
+        self.local_num_experts = self.num_experts // self.ep_size
+        self.local_num_group = self.top_k // self.ep_size
        self.bsz = None

    def token_dispatch(self,
@@ -401,17 +400,6 @@ class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
                       mc2_mask: Optional[torch.Tensor] = None,
                       apply_router_weight_on_input: bool = False,
                       with_quant: bool = False):
-        self.apply_router_weight_on_input = apply_router_weight_on_input
-        if self.apply_router_weight_on_input:
-            assert (topk_weights.dim() == 2
-                    ), "`topk_weights` should be in shape (num_tokens, topk)"
-            _, topk = topk_weights.shape
-            assert (
-                topk == 1
-            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
-            hidden_states = hidden_states * \
-                topk_weights.to(hidden_states.dtype)
-
        self.bsz, _ = hidden_states.shape
        flatten_topk_ids = topk_ids.view(-1)
        self.sorted_topk_ids = torch.argsort(flatten_topk_ids.float())
@@ -445,7 +433,7 @@ class UnquantizedTokenDispatcherWithFusedExpertsMoge(MoETokenDispatcher):
        unsorted_hidden_states = hidden_states.index_select(
            0, unsorted_topk_ids)
        final_hidden_states = unsorted_hidden_states.reshape(
-            self.bsz, self.top_k // self.local_ep, -1).sum(1)
+            self.bsz, self.top_k // self.ep_size, -1).sum(1)
        return final_hidden_states