[main] adapt usage of npu_moe_gating_top_k_softmax and remove envs.SELECT_GATING_TOPK_SOTFMAX_EXPERTS (#2112)

Backport of the v0.9.1-dev change:
https://github.com/vllm-project/vllm-ascend/pull/1902

Original change on main that introduced npu_moe_gating_top_k_softmax:
https://github.com/vllm-project/vllm-ascend/pull/1355

- vLLM version: v0.10.0
- vLLM main: 055bd3978e
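
For reference, the gating this change standardizes on can be approximated in plain PyTorch as a softmax over the router logits followed by a top-k selection. The sketch below illustrates that semantics only; it is not the fused torch_npu kernel (which, as used in vllm-ascend, additionally returns a row-index tensor for token expansion). Shapes and names are illustrative.

import torch

def gating_top_k_softmax_reference(router_logits: torch.Tensor, k: int):
    """Plain-PyTorch approximation of softmax-then-top-k MoE gating."""
    # Normalize router logits into per-token expert probabilities.
    scores = torch.softmax(router_logits, dim=-1)
    # Keep the k highest-probability experts (weights and ids) per token.
    topk_weights, topk_ids = torch.topk(scores, k, dim=-1)
    return topk_weights, topk_ids

# Example: 8 tokens routed over 8 experts with top-2 selection, matching
# the (8, 8) router_logits shape used in the test below.
weights, ids = gating_top_k_softmax_reference(torch.randn(8, 8), k=2)
assert weights.shape == (8, 2) and ids.shape == (8, 2)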

Signed-off-by: huangxialu <huangxialu1@huawei.com>
commit 9c9a7cd90b (parent e8660d7978)
Author: huangxialu
Date: 2025-07-31 21:05:56 +08:00
5 changed files with 146 additions and 89 deletions

@@ -297,9 +297,8 @@ class TestAscendUnquantizedFusedMoEMethod:
         assert not layer.w13_weight.requires_grad
         assert not layer.w2_weight.requires_grad
 
-    @pytest.mark.parametrize(
-        "others_param",
-        [[256, 4, False], [128, 1, False], [128, 1, True], [128, 4, False]])
+    @pytest.mark.parametrize("others_param",
+                             [[256, 4], [128, 1], [128, 1], [128, 4]])
     def test_apply_without_expert_map(self, moe_method, mock_dist_env,
                                       mock_moe_env, others_param):
         """
@@ -308,15 +307,13 @@ class TestAscendUnquantizedFusedMoEMethod:
         3 test use select_gating_topk_softmax_experts and fused_experts
         4 test use select_experts and fused_experts_with_all2all_buffer
         """
-        global_num_experts, ep_size, select_softmax = others_param
+        global_num_experts, ep_size = others_param
         is_prefill = False
         is_deepseek_v3_r1 = global_num_experts == 256
         forward_context = MagicMock(fused_moe_state=get_fused_moe_state(
             ep_size, is_prefill, is_deepseek_v3_r1))
-        with patch(
-                "vllm_ascend.ops.fused_moe.SELECT_GATING_TOPK_SOTFMAX_EXPERTS",
-                select_softmax), \
-            patch("vllm_ascend.ops.fused_moe.get_forward_context", return_value=forward_context):
+        with patch("vllm_ascend.ops.fused_moe.get_forward_context",
+                   return_value=forward_context):
             moe_method.ep_size = ep_size
             x = torch.randn(8, 2, 2)
             router_logits = torch.randn(8, 8)
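
With envs.SELECT_GATING_TOPK_SOTFMAX_EXPERTS removed, the test only needs to patch get_forward_context. Below is a self-contained sketch of the simplified pattern; the stub modules exist solely so the example runs without vllm_ascend installed (the real suite patches the actual module), and the test name and mocked state are illustrative.

import sys
import types
from unittest.mock import MagicMock, patch

import pytest
import torch

# Stand-in modules so the patch target resolves in this sketch; the real
# test suite imports the actual vllm_ascend.ops.fused_moe module instead.
pkg = types.ModuleType("vllm_ascend")
ops = types.ModuleType("vllm_ascend.ops")
fused_moe = types.ModuleType("vllm_ascend.ops.fused_moe")
fused_moe.get_forward_context = lambda: None
pkg.ops, ops.fused_moe = ops, fused_moe
sys.modules.update({"vllm_ascend": pkg, "vllm_ascend.ops": ops,
                    "vllm_ascend.ops.fused_moe": fused_moe})

@pytest.mark.parametrize("others_param",
                         [[256, 4], [128, 1], [128, 1], [128, 4]])
def test_apply_without_expert_map_sketch(others_param):
    # The select_softmax flag is gone, so only two values unpack now.
    global_num_experts, ep_size = others_param
    forward_context = MagicMock(fused_moe_state="mocked_state")
    with patch("vllm_ascend.ops.fused_moe.get_forward_context",
               return_value=forward_context):
        # Inputs mirror the shapes used in the hunk above.
        x = torch.randn(8, 2, 2)
        router_logits = torch.randn(8, 8)
        # The real test would call moe_method.apply(...) here; this sketch
        # just verifies the single remaining patch is in effect.
        assert fused_moe.get_forward_context() is forward_context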