fix nz for quantization (#4943)
Quantization ops always require the NZ format, so the `is_enable_nz()` check is removed for them. Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -27,7 +27,7 @@ from vllm.forward_context import get_forward_context
|
|||||||
from vllm_ascend.ascend_config import get_ascend_config
|
from vllm_ascend.ascend_config import get_ascend_config
|
||||||
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
from vllm_ascend.distributed.parallel_state import get_mc2_group
|
||||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
|
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
|
||||||
|
|
||||||
|
|
||||||
class AscendW4A8DynamicLinearMethod:
|
class AscendW4A8DynamicLinearMethod:
|
||||||
@@ -482,10 +482,9 @@ class AscendW4A8DynamicFusedMoEMethod:
|
|||||||
|
|
||||||
self.update_bias(layer, w13_bias, w2_bias)
|
self.update_bias(layer, w13_bias, w2_bias)
|
||||||
|
|
||||||
if is_enable_nz():
|
layer.w13_weight.data = torch_npu.npu_format_cast(
|
||||||
layer.w13_weight.data = torch_npu.npu_format_cast(
|
layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||||
layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
|
layer.w2_weight.data = torch_npu.npu_format_cast(
|
||||||
layer.w2_weight.data = torch_npu.npu_format_cast(
|
layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||||
layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
|
|
||||||
layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
|
layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
|
||||||
layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
|
layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
|
||||||
|
|||||||
@@ -347,7 +347,7 @@ class AscendW8A8FusedMoEMethod:
|
|||||||
# converting ACL_FORMAT_FRACTAL_NZ.
|
# converting ACL_FORMAT_FRACTAL_NZ.
|
||||||
# npu_quant_grouped_matmul_dequant in eager mode does not accept
|
# npu_quant_grouped_matmul_dequant in eager mode does not accept
|
||||||
# ACL_FORMAT_FRACTAL_NZ.
|
# ACL_FORMAT_FRACTAL_NZ.
|
||||||
if not is_310p() and is_enable_nz():
|
if not is_310p():
|
||||||
layer.w13_weight.data = torch_npu.npu_format_cast(
|
layer.w13_weight.data = torch_npu.npu_format_cast(
|
||||||
layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
|
layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
|
||||||
layer.w2_weight.data = torch_npu.npu_format_cast(
|
layer.w2_weight.data = torch_npu.npu_format_cast(
|
||||||
|
|||||||
@@ -270,9 +270,8 @@ class AscendW8A8DynamicFusedMoEMethod:
|
|||||||
1, 2).contiguous()
|
1, 2).contiguous()
|
||||||
layer.w2_weight.data = layer.w2_weight.data.transpose(
|
layer.w2_weight.data = layer.w2_weight.data.transpose(
|
||||||
1, 2).contiguous()
|
1, 2).contiguous()
|
||||||
if is_enable_nz():
|
torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
|
||||||
torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
|
torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
|
||||||
torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
|
|
||||||
layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
|
layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
|
||||||
layer.w13_weight_scale.data.shape[0], -1)
|
layer.w13_weight_scale.data.shape[0], -1)
|
||||||
layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
|
layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
|
||||||
|
|||||||
@@ -81,10 +81,9 @@ def is_enable_nz(dtype: Optional[torch.dtype] = torch.int8,
|
|||||||
"vllm_config must be provided when _ENABLE_NZ is None")
|
"vllm_config must be provided when _ENABLE_NZ is None")
|
||||||
_ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"
|
_ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"
|
||||||
|
|
||||||
_IS_EAGLE_MODE = (
|
_IS_EAGLE_MODE = (vllm_config.speculative_config is not None
|
||||||
vllm_config.speculative_config is not None and
|
and getattr(vllm_config.speculative_config, 'method',
|
||||||
getattr(vllm_config.speculative_config, 'method', None) in ("eagle", "eagle3")
|
None) in ("eagle", "eagle3"))
|
||||||
)
|
|
||||||
|
|
||||||
if dtype in [torch.float16, torch.bfloat16]:
|
if dtype in [torch.float16, torch.bfloat16]:
|
||||||
return _ENABLE_NZ if _IS_EAGLE_MODE else False
|
return _ENABLE_NZ if _IS_EAGLE_MODE else False
|
||||||
|
|||||||
Reference in New Issue
Block a user