[bugfix] fix w8a8dynamic fused_moe trans nz (#5199)
### What this PR does / why we need it?
Currently, `torch_npu.npu_grouped_matmul_swiglu_quant` only supports weights in
the NZ format, so we need to forcibly cast `w13_weight` and `w2_weight` to NZ.
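For reference, a minimal sketch of the cast this patch performs (assumed environment: an Ascend NPU with `torch_npu` installed; the shapes below are illustrative, and `ACL_FORMAT_FRACTAL_NZ` mirrors the constant exported by `vllm_ascend.utils`):

```python
import torch
import torch_npu  # requires an Ascend NPU environment

# Mirrors the constant imported from vllm_ascend.utils; 29 is the ACL
# enum value for the FRACTAL_NZ layout.
ACL_FORMAT_FRACTAL_NZ = 29

# Illustrative shape: (num_experts, 2 * intermediate_size, hidden_size).
w13_weight = torch.empty(8, 2 * 1408, 2048, dtype=torch.int8).npu()

# Force the weight into the NZ layout that
# torch_npu.npu_grouped_matmul_swiglu_quant expects.
w13_weight = torch_npu.npu_format_cast(w13_weight, ACL_FORMAT_FRACTAL_NZ)
```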
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.12.0
- vLLM main:
ad32e3e19c
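As a quick local sanity check (a sketch, not part of this patch; it assumes `torch_npu.get_npu_format` is available in the installed torch_npu build), the resulting layout can be inspected directly:

```python
import torch
import torch_npu

ACL_FORMAT_FRACTAL_NZ = 29  # as in vllm_ascend.utils

w = torch.empty(8, 2 * 1408, 2048, dtype=torch.int8).npu()
w = torch_npu.npu_format_cast(w, ACL_FORMAT_FRACTAL_NZ)
# get_npu_format returns the ACL format enum of an NPU tensor;
# after the cast it should report 29 (FRACTAL_NZ).
assert torch_npu.get_npu_format(w) == ACL_FORMAT_FRACTAL_NZ
```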
Signed-off-by: zzzzwwjj <1183291235@qq.com>
@@ -59,3 +59,48 @@ class TestAscendW8A8FusedMoEMethod(TestBase):
         self.assertEqual(param_dict["w13_weight_scale"].dtype, torch.bfloat16)
         self.assertEqual(param_dict["w13_weight_scale"].shape,
                          (self.num_experts, 2 * self.intermediate_size, 1))
+
+    def build_layer(self):
+        layer = torch.nn.Module()
+        layer.w13_weight = torch.nn.Parameter(torch.empty(
+            self.num_experts,
+            2 * self.intermediate_size,
+            self.hidden_size,
+            dtype=torch.int8),
+                                              requires_grad=False)
+        layer.w2_weight = torch.nn.Parameter(torch.empty(
+            self.num_experts,
+            self.hidden_size,
+            self.intermediate_size,
+            dtype=torch.int8),
+                                             requires_grad=False)
+        w13_weight_scale = torch.zeros(
+            (self.num_experts, 2 * self.intermediate_size, 1),
+            dtype=torch.float32)
+        layer.w13_weight_scale = torch.nn.Parameter(w13_weight_scale,
+                                                    requires_grad=False)
+        w13_weight_offset = torch.zeros(
+            (self.num_experts, 2 * self.intermediate_size, 1),
+            dtype=torch.float32)
+        layer.w13_weight_offset = torch.nn.Parameter(w13_weight_offset,
+                                                     requires_grad=False)
+        w2_weight_scale = torch.zeros((self.num_experts, self.hidden_size, 1),
+                                      dtype=torch.float32)
+        layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
+                                                   requires_grad=False)
+        w2_weight_offset = torch.zeros((self.num_experts, self.hidden_size, 1),
+                                       dtype=torch.float32)
+        layer.w2_weight_offset = torch.nn.Parameter(w2_weight_offset,
+                                                    requires_grad=False)
+        return layer
+
+    @patch('torch_npu.npu_format_cast')
+    def test_process_weights_after_loading(self, mock_npu_format_cast):
+
+        def func_by_args(weight, num_format):
+            return weight
+
+        mock_npu_format_cast.side_effect = func_by_args
+        new_layer = self.build_layer()
+        self.quant_method.process_weights_after_loading(new_layer)
+        mock_npu_format_cast.assert_called()
@@ -29,7 +29,7 @@ from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.flash_common3_context import get_flash_common3_context
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
-from vllm_ascend.utils import maybe_trans_nz
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, maybe_trans_nz
 
 
 class AscendW8A8DynamicLinearMethod:
@@ -276,8 +276,12 @@ class AscendW8A8DynamicFusedMoEMethod:
             1, 2).contiguous()
         layer.w2_weight.data = layer.w2_weight.data.transpose(1,
                                                               2).contiguous()
-        layer.w13_weight.data = maybe_trans_nz(layer.w13_weight.data)
-        layer.w2_weight.data = maybe_trans_nz(layer.w2_weight.data)
+        # TODO(zzzzwwjj): Currently, `torch_npu.npu_grouped_matmul_swiglu_quant`
+        # can only support weight nz.
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(