【0.11.0-dev】optimization of kimi-k2 in cann8.3 (#4555)
### What this PR does / why we need it? In cann8.3, npu_moe_gating_top_k operator can support expert nums with 384, so kimi can use the operator to get better preformance. --------- Signed-off-by: Levi-JQ <yujinqi2@huawei.com> Co-authored-by: Levi-JQ <yujinqi2@huawei.com>
This commit is contained in:
@@ -383,7 +383,7 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
|
||||
else:
|
||||
assert result.shape == x.shape
|
||||
|
||||
@pytest.mark.parametrize("others_param", [16, 1, 4])
|
||||
@pytest.mark.parametrize("others_param", [16, 4])
|
||||
def test_apply_with_expert_map(self, moe_method, mock_dist_env,
|
||||
mock_moe_env, others_param):
|
||||
"""
|
||||
@@ -396,9 +396,18 @@ class TestTorchairAscendUnquantizedFusedMoEMethod:
|
||||
is_prefill = False
|
||||
forward_context = MagicMock(
|
||||
fused_moe_state=_get_fused_moe_state(ep_size, is_prefill, True))
|
||||
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
|
||||
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3):
|
||||
if ep_size == 1:
|
||||
top_k_return = (torch.randn(16, 2), torch.randint(0, 16,
|
||||
(16, 2)), None)
|
||||
expert_map = torch.tensor(
|
||||
[0, 1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
|
||||
else:
|
||||
top_k_return = (torch.randn(8, 2), torch.randint(0, 8,
|
||||
(8, 2)), None)
|
||||
expert_map = torch.tensor([0, 1, 2, -1, -1, -1, -1, -1])
|
||||
with patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_forward_context", return_value=forward_context), \
|
||||
patch("vllm_ascend.torchair.ops.torchair_fused_moe.get_ascend_soc_version", return_value=AscendSocVersion.A3), \
|
||||
patch('torch_npu.npu_moe_gating_top_k', return_value=top_k_return):
|
||||
moe_method.ep_size = ep_size
|
||||
x = torch.randn(8, 2, 2)
|
||||
if ep_size == 1:
|
||||
|
||||
Reference in New Issue
Block a user