Remove qwen3 moe MC2 cumsum & cast (#3126)

What this PR does / why we need it?
The Qwen3 MoE MC2 graph currently contains two redundant computational
operators: a cumsum and a cast that are inserted after
npu_moe_distribute_dispatch_v2. By setting expert_token_nums_type=0 and
by not converting weight_scale to float32, these two operators can be
eliminated, thereby improving inference performance.

Does this PR introduce any user-facing change?
No

How was this patch tested?
No need

vLLM version: v0.10.2
vLLM main:
f225ea7dd9

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: florenceCH <gaoxiang120@huawei.com>
Co-authored-by: florenceCH <gaoxiang120@huawei.com>
This commit is contained in:
florenceCH
2025-09-26 08:51:30 +08:00
committed by GitHub
parent 2930e4a6bd
commit 14497b748d
3 changed files with 5 additions and 4 deletions

View File

@@ -98,7 +98,7 @@ class TestTokenDispatcherWithMC2(TestBase):
self.row_idx, expert_map)
mock_dispatch.assert_called_once()
self.assertEqual(output["group_list_type"],
1) # group_list_type == 1
0) # group_list_type == 0
def test_token_dispatch_with_shared_experts_and_quant(self):
self.shared_experts = MagicMock()

View File

@@ -79,8 +79,6 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
is_mc2 = get_forward_context().moe_comm_type == MoECommType.MC2
if w1_scale_bias is None and is_mc2:
if w1_scale.dtype != torch.float32:
w1_scale = w1_scale.to(torch.float32)
if fusion:
# gmm1: gate_up_proj & act_fn: swiglu
hidden_states, swiglu_out_scale, _ = torch_npu.npu_grouped_matmul_swiglu_quant(
@@ -90,6 +88,8 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
weight_scale=w1_scale,
x_scale=pertoken_scale)
else:
if w1_scale.dtype != torch.float32:
w1_scale = w1_scale.to(torch.float32)
# gmm1: gate_up_proj
hidden_states = torch_npu.npu_grouped_matmul(
x=[hidden_states],

View File

@@ -133,6 +133,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
"shared_expert_rank_num": 0,
"moe_expert_num": moe_expert_num,
"global_bs": 0,
"expert_token_nums_type": 0,
}
stage1_kwargs = {
@@ -204,7 +205,7 @@ class TokenDispatcherWithMC2(MoETokenDispatcher):
if shared_experts is not None:
shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
self.shared_act = shared_experts.act_fn(shared_gate_up)
group_list_type = 1
group_list_type = 0
return {
"group_list_type": group_list_type,
"hidden_states": expand_x,