Remove unused row_idx in token_dispatcher (#3442)

### What this PR does / why we need it?
The `row_idx` parameter is no longer used since
PR[#2689](https://github.com/vllm-project/vllm-ascend/pull/2689), so
remove it across multiple files to remove unnecessary calculations and
parameter passing.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
accuracy test passed for Qwen3 235B and DeepSeek V3 671B after this PR.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: CaranLic <740821011@qq.com>
This commit is contained in:
CaranLic
2025-10-15 09:08:31 +08:00
committed by GitHub
parent 3642b64afc
commit 15b2e5c995
11 changed files with 37 additions and 88 deletions

View File

@@ -21,17 +21,6 @@ import torch_npu
from vllm.forward_context import get_forward_context
def return_row_idx(hidden_states, top_k):
num_tokens = hidden_states.shape[0]
row_idx_len = num_tokens * top_k
row_idx = (torch.arange(0,
row_idx_len,
dtype=torch.int32,
device=hidden_states.device).view(
top_k, -1).permute(1, 0).contiguous())
return row_idx
def select_experts(hidden_states: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
@@ -71,7 +60,7 @@ def select_experts(hidden_states: torch.Tensor,
if weight_prefetch_method:
weight_prefetch_method.maybe_prefetch_moe_weight_preprocess(
hidden_states, "gate_up")
topk_weights, topk_ids, row_idx = _select_experts_with_fusion_ops(
topk_weights, topk_ids = _select_experts_with_fusion_ops(
hidden_states=hidden_states,
router_logits=router_logits,
top_k=top_k,
@@ -99,9 +88,7 @@ def select_experts(hidden_states: torch.Tensor,
e_score_correction_bias=e_score_correction_bias,
global_num_experts=global_num_experts,
)
if row_idx is None:
row_idx = return_row_idx(hidden_states, top_k)
return topk_weights, topk_ids, row_idx
return topk_weights, topk_ids
def _native_grouped_topk(
@@ -187,7 +174,7 @@ def _select_experts_with_fusion_ops(
routed_scaling_factor=1.0,
global_num_experts: int = -1):
topk_weights, topk_ids, row_idx = None, None, None
topk_weights, topk_ids = None, None
# NOTE: now npu_moe_gating_top_k can only support 'group_count=256' pattern
is_deepseek_v3_r1 = global_num_experts == 256
if is_deepseek_v3_r1:
@@ -205,14 +192,13 @@ def _select_experts_with_fusion_ops(
# y2_flag=False, # old api; should the third output be output
routed_scaling_factor=1,
eps=float(1e-20))
row_idx = return_row_idx(hidden_states, top_k)
if not use_grouped_topk and custom_routing_function is None and scoring_func == "softmax":
topk_weights, topk_ids, row_idx = torch_npu.npu_moe_gating_top_k_softmax(
topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax(
x=router_logits, finished=None, k=top_k)
topk_ids = topk_ids.to(torch.int32)
topk_weights = _renormalize_topk_weights(topk_weights, renormalize)
return topk_weights, topk_ids, row_idx
return topk_weights, topk_ids
def _native_select_experts(