allgather use fusedop. (#2689)

### What this PR does / why we need it? Use 'npu_moe_init_routing_v2' &'npu_moe_token_unpermute' repalce 'npu_moe_init_routing' &‘npu_moe_compute_expert_tokens’& 'npu_moe_finalize_routing' to optimize performance ### Does this PR introduce _any_ user-facing change? | branch| tps| TTFT |TPOT | | --- | --- | --- |--- | |main |733.98 | 280.05 |34.30 | |main+fusedop | 740.33 | 273.34 |33.99 | ### How was this patch tested? - vLLM version: v0.10.1.1 - vLLM main: 6997a25ac6 Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com> Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-09-04 11:56:29 +08:00
parent 7d47d8f4f6
commit f86596a66c
3 changed files with 66 additions and 160 deletions
--- a/tests/e2e/singlecard/ops/test_fused_moe.py
+++ b/tests/e2e/singlecard/ops/test_fused_moe.py
@@ -33,7 +33,7 @@ from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
    TokenDispatcherWithAllGather

 NUM_EXPERTS = [8, 64]
-EP_SIZE = [1, 4]
+EP_SIZE = [1]
 TOP_KS = [2, 6]
 DEVICE = ["npu"]

@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
    w1_local = w1
    w2_local = w2

-    if ep_size > 1:
-        local_e = e // ep_size
-        e_ids = torch.arange(local_e * 0,
-                             local_e * (0 + 1),
-                             device=device,
-                             dtype=torch.int32)
-        expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
-        expert_map[e_ids] = torch.arange(local_e,
-                                         device=device,
-                                         dtype=torch.int32)
-        w1_local = w1[e_ids]
-        w2_local = w2[e_ids]
-
    score = torch.softmax(score, dim=-1, dtype=dtype)
    topk_weights, topk_ids = torch.topk(score, topk)
    topk_ids = topk_ids.to(torch.int32)