support mxfp8 quantization (qwen dense) (#5723)
### What this PR does / why we need it?
Support MXFP8 quantization for Qwen linear layers.
### Does this PR introduce _any_ user-facing change?
### How was this patch tested?
- vLLM version: v0.13.0
- vLLM main: 2f4e6548ef
Signed-off-by: wangyao <iwangyao@outlook.com>
This commit is contained in:
@@ -45,7 +45,7 @@ from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
|
||||
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, flashcomm2_enable,
|
||||
mlp_tp_enable, oproj_tp_enable)
|
||||
|
||||
from .utils import get_quant_method
|
||||
from .utils import get_quant_method, is_mx_quant_type
|
||||
|
||||
|
||||
@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
|
||||
@@ -401,7 +401,8 @@ class AscendLinearMethod(LinearMethodBase):
|
||||
set_weight_attrs(param, {"output_dim": 0})
|
||||
layer.register_parameter(pergroup_name, param)
|
||||
set_weight_attrs(param, extra_weight_attrs)
|
||||
if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name:
|
||||
if "weight_scale_second" in pergroup_name or "weight_offset_second" in pergroup_name \
|
||||
or is_mx_quant_type(self.quant_method):
|
||||
setattr(param, "input_dim", 1)
|
||||
param.input_dim = 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user