[bugfix] fix w8a8dynamic fused_moe trans nz (#5199)

### What this PR does / why we need it?
Currently, `torch_npu.npu_grouped_matmul_swiglu_quant` only supports
weights in NZ format, so we need to forcibly convert `w13_weight` and `w2_weight` to NZ.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c

Signed-off-by: zzzzwwjj <1183291235@qq.com>
This commit is contained in:
zzzzwwjj
2025-12-22 17:45:34 +08:00
committed by GitHub
parent 55beac9c91
commit 052e472453
2 changed files with 52 additions and 3 deletions

View File

@@ -59,3 +59,48 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
self.assertEqual(param_dict["w13_weight_scale"].shape,
(self.num_experts, 2 * self.intermediate_size, 1))
def build_layer(self):
    """Build a bare ``torch.nn.Module`` holding MoE weight parameters.

    The module receives uninitialised int8 ``w13_weight`` / ``w2_weight``
    tensors plus zero-filled float32 ``*_weight_scale`` and
    ``*_weight_offset`` tensors. Sizes are taken from ``self.num_experts``,
    ``self.intermediate_size`` and ``self.hidden_size``; every tensor is
    wrapped in a ``Parameter`` with ``requires_grad=False``.

    Returns:
        The populated ``torch.nn.Module``.
    """

    def frozen(tensor):
        # All parameters on this layer are created with gradients disabled.
        return torch.nn.Parameter(tensor, requires_grad=False)

    experts = self.num_experts
    hidden = self.hidden_size
    inter = self.intermediate_size

    # Scale and offset tensors share one shape per projection.
    w13_quant_shape = (experts, 2 * inter, 1)
    w2_quant_shape = (experts, hidden, 1)

    module = torch.nn.Module()

    # int8 weights; contents are left uninitialised (torch.empty).
    module.w13_weight = frozen(
        torch.empty(experts, 2 * inter, hidden, dtype=torch.int8))
    module.w2_weight = frozen(
        torch.empty(experts, hidden, inter, dtype=torch.int8))

    # Zero-filled float32 scale/offset tensors, assigned in the same order
    # as before so parameter registration order is unchanged.
    module.w13_weight_scale = frozen(
        torch.zeros(w13_quant_shape, dtype=torch.float32))
    module.w13_weight_offset = frozen(
        torch.zeros(w13_quant_shape, dtype=torch.float32))
    module.w2_weight_scale = frozen(
        torch.zeros(w2_quant_shape, dtype=torch.float32))
    module.w2_weight_offset = frozen(
        torch.zeros(w2_quant_shape, dtype=torch.float32))
    return module
@patch('torch_npu.npu_format_cast')
def test_process_weights_after_loading(self, mock_npu_format_cast):
    """Check that post-load weight processing calls ``npu_format_cast``.

    ``torch_npu.npu_format_cast`` is patched with a stand-in that returns
    its ``weight`` argument unchanged; the test only asserts that the
    patched function was called at least once.
    """
    # Identity stand-in: hand the weight back and ignore the format code.
    mock_npu_format_cast.side_effect = lambda weight, num_format: weight

    self.quant_method.process_weights_after_loading(self.build_layer())

    mock_npu_format_cast.assert_called()