[Qwen-moe] Remove the minor operation arange (#2373)

### What this PR does / why we need it?
Integrate the `arange` operator into the surrounding computation to reduce the time spent on this minor operation and improve performance.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
56dcf4e7e9

---------

Signed-off-by: s30076806 <songjiayang2@h-partners.com>
This commit is contained in:
s30076806
2025-08-27 09:13:31 +08:00
committed by GitHub
parent 358ba68994
commit 6a4ec186e7
9 changed files with 80 additions and 79 deletions

View File

@@ -719,12 +719,12 @@ class TestSelectExperts(TestBase):
def test_softmax_scoring(self):
"""Test softmax scoring function"""
weights, ids = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=False,
renormalize=False,
scoring_func="softmax")
weights, ids, _ = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=False,
renormalize=False,
scoring_func="softmax")
self.assertEqual(weights.shape, (self.num_tokens, self.top_k))
self.assertEqual(ids.shape, (self.num_tokens, self.top_k))
@@ -732,12 +732,12 @@ class TestSelectExperts(TestBase):
def test_sigmoid_scoring(self):
"""Test sigmoid scoring function"""
weights, ids = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=False,
renormalize=False,
scoring_func="sigmoid")
weights, ids, _ = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=False,
renormalize=False,
scoring_func="sigmoid")
self.assertEqual(weights.shape, (self.num_tokens, self.top_k))
self.assertEqual(ids.shape, (self.num_tokens, self.top_k))
@@ -760,13 +760,13 @@ class TestSelectExperts(TestBase):
self.top_k,
dtype=torch.long))
weights, ids = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=True,
renormalize=False,
topk_group=4,
num_expert_group=2)
weights, ids, _ = select_experts(hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
use_grouped_topk=True,
renormalize=False,
topk_group=4,
num_expert_group=2)
mock_topk.assert_called()
self.assertEqual(weights.shape, (self.num_tokens, self.top_k))
@@ -780,7 +780,7 @@ class TestSelectExperts(TestBase):
self.num_experts)
e_score_correction_bias = torch.randn(self.num_experts)
weights, ids = select_experts(
weights, ids, _ = select_experts(
hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
@@ -803,7 +803,7 @@ class TestSelectExperts(TestBase):
self.top_k,
dtype=torch.int32))
weights, ids = select_experts(
weights, ids, _ = select_experts(
hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
@@ -824,7 +824,7 @@ class TestSelectExperts(TestBase):
self.top_k,
dtype=torch.long))
weights, _ = select_experts(
weights, ids, _ = select_experts(
hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,
@@ -844,7 +844,7 @@ class TestSelectExperts(TestBase):
self.top_k,
dtype=torch.long))
weights, ids = select_experts(
weights, ids, _ = select_experts(
hidden_states=self.hidden_states,
router_logits=self.router_logits,
top_k=self.top_k,

View File

@@ -55,6 +55,12 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
torch.randn(self.num_tokens),
)
mock_moe_finalize_routing.return_value = self.placeholder
row_idx_len = self.num_tokens * 8
row_idx = (torch.arange(
0,
row_idx_len,
dtype=torch.int32,
).view(8, -1).permute(1, 0).contiguous())
result = fused_experts_with_all2all(
hidden_states=self.placeholder,
@@ -64,6 +70,7 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
w2_scale=self.placeholder,
topk_weights=self.placeholder,
topk_ids=self.placeholder,
row_idx=row_idx,
top_k=8,
expert_map=expert_map,
ep_group=ep_group,