[Qwen-moe] Remove the minor operation arange (#2373)

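### What this PR does / why we need it? Return `row_idx` from `select_experts` together with `topk_weights` and `topk_ids` (reusing the third output of `npu_moe_gating_top_k_softmax` when that fused operator is used) so that `row_idx` no longer has to be built by a separate minor `arange` operation, which reduces the time spent and improves performance. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: 56dcf4e7e9 --------- Signed-off-by: s30076806 <songjiayang2@h-partners.com>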
2025-08-27 09:13:31 +08:00
parent 358ba68994
commit 6a4ec186e7
9 changed files with 80 additions and 79 deletions
--- a/vllm_ascend/ops/layers/experts_selector.py
+++ b/vllm_ascend/ops/layers/experts_selector.py
@@ -20,6 +20,17 @@ import torch
 import torch_npu


+def return_row_idx(hidden_states, top_k):  # builds an int32 index tensor of shape (num_tokens, top_k)
+    num_tokens = hidden_states.shape[0]  # size of the first dimension of hidden_states
+    row_idx_len = num_tokens * top_k  # total number of index entries to generate
+    row_idx = (torch.arange(0,  # values 0 .. row_idx_len - 1
+                            row_idx_len,
+                            dtype=torch.int32,
+                            device=hidden_states.device).view(  # allocated on the same device as hidden_states
+                                top_k, -1).permute(1, 0).contiguous())  # (top_k, num_tokens) -> (num_tokens, top_k), made contiguous
+    return row_idx  # row_idx[t, k] == k * num_tokens + t
+
+
 def select_experts(hidden_states: torch.Tensor,
                   router_logits: torch.Tensor,
                   top_k: int,
@@ -56,7 +67,8 @@ def select_experts(hidden_states: torch.Tensor,
        topk_ids: selected expert IDs of shape (num_tokens, top_k).
    """

-    topk_weights, topk_ids = _select_experts_with_fusion_ops(
+    topk_weights, topk_ids, row_idx = _select_experts_with_fusion_ops(
+        hidden_states=hidden_states,
        router_logits=router_logits,
        top_k=top_k,
        use_grouped_topk=use_grouped_topk,
@@ -83,7 +95,9 @@ def select_experts(hidden_states: torch.Tensor,
            e_score_correction_bias=e_score_correction_bias,
            global_num_experts=global_num_experts,
        )
-    return topk_weights, topk_ids
+    if row_idx is None:
+        row_idx = return_row_idx(hidden_states, top_k)
+    return topk_weights, topk_ids, row_idx


 def _native_grouped_topk(
@@ -156,6 +170,7 @@ def _select_expert_use_group_topk(


 def _select_experts_with_fusion_ops(
+        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        top_k: int,
        use_grouped_topk: bool,
@@ -168,7 +183,7 @@ def _select_experts_with_fusion_ops(
        global_num_experts: int = -1,
        is_unquantized: bool = False):

-    topk_weights, topk_ids = None, None
+    topk_weights, topk_ids, row_idx = None, None, None
    # NOTE: now npu_moe_gating_top_k can only support 'group_count=256' pattern
    is_deepseek_v3_r1 = global_num_experts == 256
    if is_deepseek_v3_r1:
@@ -186,14 +201,14 @@ def _select_experts_with_fusion_ops(
            # y2_flag=False, # old api; should the third output be output
            routed_scaling_factor=1,
            eps=float(1e-20))
-
+        row_idx = return_row_idx(hidden_states, top_k)
    if not use_grouped_topk and custom_routing_function is None and scoring_func == "softmax" and is_unquantized:
-        topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax(
+        topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax(
            x=router_logits, finished=None, k=top_k)
        topk_ids = topk_ids.to(torch.int32)
        topk_weights = _renormalize_topk_weights(topk_weights, renormalize)

-    return topk_weights, topk_ids
+    return topk_weights, topk_ids, row_idx


 def _native_select_experts(