[Qwen-moe] Remove the minor operation arange (#2373)

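### What this PR does / why we need it?
Build the `arange`-based row index (`row_idx`) in the caller and pass it to the fused-MoE expert functions as an argument, so that this minor `arange` operation is no longer performed inside them. This reduces the time spent and improves performance.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main: 56dcf4e7e9

---------

Signed-off-by: s30076806 <songjiayang2@h-partners.com>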
2025-08-27 09:13:31 +08:00
parent 358ba68994
commit 6a4ec186e7
9 changed files with 80 additions and 79 deletions
--- a/tests/ut/quantization/test_w8a8_dynamic.py
+++ b/tests/ut/quantization/test_w8a8_dynamic.py
@@ -55,6 +55,12 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
            torch.randn(self.num_tokens),
        )
        mock_moe_finalize_routing.return_value = self.placeholder
+        row_idx_len = self.num_tokens * 8
+        row_idx = (torch.arange(
+            0,
+            row_idx_len,
+            dtype=torch.int32,
+        ).view(8, -1).permute(1, 0).contiguous())

        result = fused_experts_with_all2all(
            hidden_states=self.placeholder,
@@ -64,6 +70,7 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
            w2_scale=self.placeholder,
            topk_weights=self.placeholder,
            topk_ids=self.placeholder,
+            row_idx=row_idx,
            top_k=8,
            expert_map=expert_map,
            ep_group=ep_group,