[Kernel] Add moe_gating_top_k operator support for Ascend NPU (#5579)

### What this PR does / why we need it?

1. Replace `moe_gating_top_k` from torch_npu with a custom op.
2. Enable the renorm function of `moe_gating_top_k` in the softmax scenario, as sketched below.
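
For readers unfamiliar with the fused op, here is a rough pure-PyTorch sketch of its softmax path as this PR now configures it. The function name `moe_gating_top_k_reference` and the exact kernel semantics are assumptions for illustration, not the kernel source:

```python
import torch

def moe_gating_top_k_reference(router_logits: torch.Tensor,
                               k: int,
                               renorm: int = 0):
    """Assumed softmax path: softmax scoring -> top-k -> optional renorm."""
    scores = torch.softmax(router_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(scores, k, dim=-1)
    if renorm:  # renorm=1: rescale the selected weights to sum to 1
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
```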

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
No tests needed.

- vLLM version: v0.13.0
- vLLM main: 7157596103

---------

Signed-off-by: ZCG12345 <2097562023@qq.com>
Commit 3be8e33fe9 (parent 1165b2c863)
Author: ZCG12345
Date: 2026-01-07 21:42:31 +08:00
32 changed files with 4667 additions and 13 deletions


@@ -17,7 +17,6 @@
 from typing import Callable, Optional
 import torch
-import torch_npu
 from vllm_ascend.utils import get_weight_prefetch_method
@@ -64,6 +63,7 @@ def select_experts(hidden_states: torch.Tensor,
     is_support_npu_moe_gating_top_k = check_npu_moe_gating_top_k(
         hidden_states=hidden_states,
         top_k=top_k,
+        renormalize=renormalize,
         topk_group=topk_group,
         num_expert_group=num_expert_group,
         scoring_func=scoring_func,
@@ -102,10 +102,13 @@ def select_experts(hidden_states: torch.Tensor,
 def check_npu_moe_gating_top_k(
         hidden_states: torch.Tensor,
         top_k: int,
+        renormalize: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
         scoring_func: str = "softmax",
         custom_routing_function: Optional[Callable] = None):
+    if scoring_func == "sigmoid" and not renormalize:  # sigmoid + renorm=0 is not supported in current branch
+        return False
     if custom_routing_function is not None:
         return False
     if scoring_func != "softmax" and scoring_func != "sigmoid":
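
For context on the new check, a hedged sketch of how the conditions visible in this hunk route configurations (the hunk is truncated, so checks below it may still force a fallback; the tensor shape and argument values are illustrative):

```python
import torch

hidden = torch.randn(4, 1024)

# sigmoid scoring without renormalization falls back to the unfused path
check_npu_moe_gating_top_k(hidden, top_k=8, renormalize=False,
                           scoring_func="sigmoid")                    # False

# a custom routing function always falls back
check_npu_moe_gating_top_k(hidden, top_k=8, renormalize=True,
                           custom_routing_function=lambda *a: None)   # False

# softmax + renorm can use the fused kernel, subject to the later checks
check_npu_moe_gating_top_k(hidden, top_k=8, renormalize=True,
                           scoring_func="softmax")
```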
@@ -209,26 +212,25 @@ def _select_experts_with_fusion_ops(
     topk_group = topk_group if topk_group is not None else 1
     num_expert_group = num_expert_group if num_expert_group is not None else 1
+    renorm = int(renormalize)
     norm_type = 0 if scoring_func == "softmax" else 1
     if e_score_correction_bias is not None and \
             e_score_correction_bias.dtype != router_logits.dtype:
         e_score_correction_bias = e_score_correction_bias.to(
             router_logits.dtype)
-    topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
+    topk_weights, topk_ids, _ = torch.ops._C_ascend.moe_gating_top_k(
         router_logits,
         k=top_k,
-        bias=e_score_correction_bias,
         k_group=topk_group,
         group_count=num_expert_group,
-        group_select_mode=1,  # 0: the maximum in the group; 1: topk2.sum(fix)
-        renorm=0,  # 0: softmax->topk(fix); 1: topk->softmax
+        group_select_mode=1,
+        renorm=renorm,
         norm_type=norm_type,  # 0: softmax; 1: sigmoid
-        # out_flag=False, # todo new api; should the third output be output
-        # y2_flag=False, # old api; should the third output be output
+        out_flag=False,
         routed_scaling_factor=routed_scaling_factor,
-        eps=float(1e-20))
-    if scoring_func == "softmax":
-        topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
+        eps=float(1e-20),
+        bias_opt=e_score_correction_bias,
+    )
     return topk_weights, topk_ids
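
The net behavioral change in this hunk: softmax renormalization moves from a Python post-processing step (`_renormalize_topk_weights` after calling the kernel with `renorm=0`) into the kernel itself (`renorm=int(renormalize)`). A minimal equivalence sketch using the pure-PyTorch reference from the description above; this encodes an assumption about the kernel's semantics, not a test shipped with the PR:

```python
import torch

logits = torch.randn(4, 64)

# old path: kernel with renorm=0, renormalize afterwards in Python
w_old, ids_old = moe_gating_top_k_reference(logits, k=8, renorm=0)
w_old = w_old / w_old.sum(dim=-1, keepdim=True)

# new path: renormalization handled inside the (reference) op
w_new, ids_new = moe_gating_top_k_reference(logits, k=8, renorm=1)

assert torch.equal(ids_old, ids_new)
assert torch.allclose(w_old, w_new)
```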