【0.11.0-dev】optimization of kimi-k2 in cann8.3 (#4555)

### What this PR does / why we need it?
In CANN 8.3, the npu_moe_gating_top_k operator supports expert counts up to
384, so Kimi-K2 can use this operator to achieve better performance.
---------

Signed-off-by: Levi-JQ <yujinqi2@huawei.com>
Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
This commit is contained in:
Levi
2025-12-09 08:49:15 +08:00
committed by GitHub
parent 0d094531b4
commit 9862a23985
4 changed files with 54 additions and 96 deletions

View File

@@ -383,7 +383,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
else:
assert result.shape == x.shape
@pytest.mark.parametrize("others_param", [16, 1, 4])
@pytest.mark.parametrize("others_param", [16, 4])
def test_apply_with_expert_map(self, moe_method, mock_dist_env,
mock_moe_env, others_param):
"""
@@ -396,9 +396,18 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
is_prefill = False
forward_context = MagicMock(
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
if ep_size == 1:
top_k_return = (torch.randn(16, 2), torch.randint(0, 16,
(16, 2)), None)
expert_map = torch.tensor(
[0, 1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
else:
top_k_return = (torch.randn(8, 2), torch.randint(0, 8,
(8, 2)), None)
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
patch('torch_npu.npu_moe_gating_top_k', return_value=top_k_return):
moe_method.ep_size = ep_size
x = torch.randn(8, 2, 2)
if ep_size == 1: