[Perf] Enable npu_moe_gating_top_k_softmax in quantized scenarios (#2633)

### What this PR does / why we need it?
This PR enables `npu_moe_gating_top_k_softmax` when running quantized
MoE (such as W8A8). This op in fact makes no distinction between
quantized and non-quantized scenarios, so it can be used in both.
Introducing this op reduces TPOT by 3–4 ms.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
ce30dca5c4

Signed-off-by: Angazenn <supperccell@163.com>
This commit is contained in:
Angazenn
2025-09-03 09:14:17 +08:00
committed by GitHub
parent 24d4dad7b2
commit b84465c525
3 changed files with 33 additions and 15 deletions

View File

@@ -173,8 +173,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias,
global_num_experts=global_num_experts,
is_unquantized=True)
global_num_experts=global_num_experts)
topk_weights = topk_weights.to(x.dtype)
# this is a naive implementation for experts load balance so as

View File

@@ -43,7 +43,6 @@ def select_experts(hidden_states: torch.Tensor,
routed_scaling_factor=1.0,
e_score_correction_bias: Optional[torch.Tensor] = None,
indices_type: Optional[torch.dtype] = None,
is_unquantized: bool = False,
global_num_experts: int = -1):
"""
Fused experts with select experts.
@@ -60,7 +59,6 @@ def select_experts(hidden_states: torch.Tensor,
scoring_func: Scoring function to use.
e_score_correction_bias: Correction bias to apply to expert scores.
indices_type: dtype of indices
is_unquantized: Whether the data are unquantized.
global_num_experts: Global number of experts.
Returns:
@@ -80,8 +78,7 @@ def select_experts(hidden_states: torch.Tensor,
custom_routing_function=custom_routing_function,
scoring_func=scoring_func,
routed_scaling_factor=routed_scaling_factor,
global_num_experts=global_num_experts,
is_unquantized=is_unquantized)
global_num_experts=global_num_experts)
if topk_weights is None:
topk_weights, topk_ids = _native_select_experts(
@@ -183,8 +180,7 @@ def _select_experts_with_fusion_ops(
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
routed_scaling_factor=1.0,
global_num_experts: int = -1,
is_unquantized: bool = False):
global_num_experts: int = -1):
topk_weights, topk_ids, row_idx = None, None, None
# NOTE: now npu_moe_gating_top_k can only support 'group_count=256' pattern
@@ -205,7 +201,7 @@ def _select_experts_with_fusion_ops(
routed_scaling_factor=1,
eps=float(1e-20))
row_idx = return_row_idx(hidden_states, top_k)
if not use_grouped_topk and custom_routing_function is None and scoring_func == "softmax" and is_unquantized:
if not use_grouped_topk and custom_routing_function is None and scoring_func == "softmax":
topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax(
x=router_logits, finished=None, k=top_k)
topk_ids = topk_ids.to(torch.int32)