[Kernel] Add moe_gating_top_k operator support for Ascend NPU (#5579)
### What this PR does / why we need it?
1. Replace `torch_npu.npu_moe_gating_top_k` with the custom op `torch.ops._C_ascend.moe_gating_top_k`.
2. Enable the renorm function of `moe_gating_top_k` in the softmax scenario.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
No test needed.
- vLLM version: v0.13.0
- vLLM main: 7157596103
---------
Signed-off-by: ZCG12345 <2097562023@qq.com>
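
For context, `moe_gating_top_k` scores each token against every expert, keeps the top-k experts, and optionally renormalizes the kept weights. Below is a minimal plain-PyTorch sketch of the softmax-scenario semantics; it is illustrative only, and the fused NPU kernel's signature and exact numerics differ:

```python
import torch


def gating_top_k_reference(router_logits: torch.Tensor, k: int,
                           renormalize: bool):
    """Plain-PyTorch sketch of softmax top-k gating (not the fused op)."""
    scores = torch.softmax(router_logits, dim=-1)            # [tokens, experts]
    topk_weights, topk_ids = torch.topk(scores, k, dim=-1)   # [tokens, k]
    if renormalize:
        # The "renorm" this PR enables: kept weights sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


weights, ids = gating_top_k_reference(torch.randn(4, 8), k=2, renormalize=True)
assert torch.allclose(weights.sum(-1), torch.ones(4))
```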
```diff
@@ -17,7 +17,6 @@
 from typing import Callable, Optional
 
 import torch
-import torch_npu
 
 from vllm_ascend.utils import get_weight_prefetch_method
 
```
```diff
@@ -64,6 +63,7 @@ def select_experts(hidden_states: torch.Tensor,
     is_support_npu_moe_gating_top_k = check_npu_moe_gating_top_k(
         hidden_states=hidden_states,
         top_k=top_k,
+        renormalize=renormalize,
         topk_group=topk_group,
         num_expert_group=num_expert_group,
         scoring_func=scoring_func,
```
```diff
@@ -102,10 +102,13 @@ def select_experts(hidden_states: torch.Tensor,
 def check_npu_moe_gating_top_k(
         hidden_states: torch.Tensor,
         top_k: int,
+        renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
         scoring_func: str = "softmax",
         custom_routing_function: Optional[Callable] = None):
+    if scoring_func == "sigmoid" and not renormalize:  # sigmoid + renorm=0 is not supported in the current branch
+        return False
     if custom_routing_function is not None:
         return False
     if scoring_func != "softmax" and scoring_func != "sigmoid":
```
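The two rejections visible in this hunk can be exercised directly. A hedged sketch, assuming `check_npu_moe_gating_top_k` from the hunk above is in scope (the hunk is cut off before any remaining checks, so only the negative cases are asserted):

```python
import torch

hidden_states = torch.randn(4, 16)  # arbitrary demo shape

# sigmoid scoring without renormalization is explicitly rejected:
assert not check_npu_moe_gating_top_k(hidden_states, top_k=2,
                                      renormalize=False,
                                      scoring_func="sigmoid")
# any custom routing function forces the non-fused fallback path:
assert not check_npu_moe_gating_top_k(hidden_states, top_k=2,
                                      renormalize=True,
                                      custom_routing_function=lambda x: x)
```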
```diff
@@ -209,26 +212,25 @@ def _select_experts_with_fusion_ops(
 
     topk_group = topk_group if topk_group is not None else 1
     num_expert_group = num_expert_group if num_expert_group is not None else 1
+    renorm = int(renormalize)
     norm_type = 0 if scoring_func == "softmax" else 1
     if e_score_correction_bias is not None and \
             e_score_correction_bias.dtype != router_logits.dtype:
         e_score_correction_bias = e_score_correction_bias.to(
             router_logits.dtype)
-    topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+    topk_weights, topk_ids, _ = torch.ops._C_ascend.moe_gating_top_k(
         router_logits,
         k=top_k,
-        bias=e_score_correction_bias,
         k_group=topk_group,
         group_count=num_expert_group,
-        group_select_mode=1,  # 0: the maximum in the group; 1: topk2.sum(fix)
-        renorm=0,  # 0: softmax->topk(fix); 1: topk->softmax
+        group_select_mode=1,
+        renorm=renorm,
         norm_type=norm_type,  # 0: softmax; 1: sigmoid
-        # out_flag=False, # todo new api; should the third output be output
-        # y2_flag=False, # old api; should the third output be output
+        out_flag=False,
         routed_scaling_factor=routed_scaling_factor,
-        eps=float(1e-20))
-    if scoring_func == "softmax":
-        topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
+        eps=float(1e-20),
+        bias_opt=e_score_correction_bias,
+    )
 
     return topk_weights, topk_ids
 
```
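The behavioral change in this hunk: previously `renorm` was pinned to 0 and softmax renormalization was applied afterwards via `_renormalize_topk_weights`; now the flag is derived from the caller's `renormalize` argument and the custom op renormalizes in-kernel. A small sketch of the flag mapping (the helper name is illustrative, not from this diff):

```python
def fused_gating_flags(scoring_func: str, renormalize: bool):
    """Illustrative helper mirroring the flag setup in the hunk above."""
    norm_type = 0 if scoring_func == "softmax" else 1  # 0: softmax; 1: sigmoid
    renorm = int(renormalize)  # now forwarded to the op instead of a fixed 0
    return norm_type, renorm


assert fused_gating_flags("softmax", True) == (0, 1)
assert fused_gating_flags("softmax", False) == (0, 0)
assert fused_gating_flags("sigmoid", True) == (1, 1)
```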