【0.11.0-dev】optimization of kimi-k2 in cann8.3 (#4555)

### What this PR does / why we need it?
In CANN 8.3, the npu_moe_gating_top_k operator supports expert counts up to
384, so Kimi-K2 can use this operator to achieve better performance.
---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
This commit is contained in:
Levi
2025-12-09 08:49:15 +08:00
committed by GitHub
parent 0d094531b4
commit 9862a23985
4 changed files with 54 additions and 96 deletions

View File

@@ -383,7 +383,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
else:
assert result.shape == x.shape
@pytest.mark.parametrize("others_param", [16, 1, 4])
@pytest.mark.parametrize("others_param", [16, 4])
def test_apply_with_expert_map(self, moe_method, mock_dist_env,
mock_moe_env, others_param):
"""
@@ -396,9 +396,18 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
is_prefill = False
forward_context = MagicMock(
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
if ep_size == 1:
top_k_return = (torch.randn(16, 2), torch.randint(0, 16,
(16, 2)), None)
expert_map = torch.tensor(
[0, 1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
else:
top_k_return = (torch.randn(8, 2), torch.randint(0, 8,
(8, 2)), None)
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
patch('torch_npu.npu_moe_gating_top_k', return_value=top_k_return):
moe_method.ep_size = ep_size
x = torch.randn(8, 2, 2)
if ep_size == 1: