[v0.18.0][Bugfix]Fix Error "AttributeError: 'AscendCompressedTensorsConfig' object has no attribute 'enabling_fa_quant'" (#7748)
### What this PR does / why we need it? cherry-pick from https://github.com/vllm-project/vllm-ascend/pull/7736 **Error information** When the quantized weights in CompressedTensors format of the kimi-k2 model are used, the following error is reported: `AttributeError: 'AscendCompressedTensorsConfig' object has no attribute 'enabling_fa_quant'` **Error Cause** Currently, FA3 quantization supports only modelslim-quantized weights. The methods it relies on are not defined in AscendCompressedTensorsConfig. **Solution** Before invoking the related methods, check whether the FA3 feature is enabled. Additionally, the unused `get_scaled_act_names` method and its corresponding unit test have been removed. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing unit tests were updated by removing a deprecated test case, and the refactored logic was reviewed for correctness. Signed-off-by: Wang Kunpeng <1289706727@qq.com>
This commit is contained in:
@@ -253,6 +253,3 @@ class TestAscendModelSlimConfig(TestBase):
|
|||||||
self.assertIn("model.layers.0.weight", config.quant_description)
|
self.assertIn("model.layers.0.weight", config.quant_description)
|
||||||
self.assertEqual(config.quant_description["model.layers.0.weight"],
|
self.assertEqual(config.quant_description["model.layers.0.weight"],
|
||||||
"INT8")
|
"INT8")
|
||||||
|
|
||||||
def test_get_scaled_act_names(self):
|
|
||||||
self.assertEqual(self.ascend_config.get_scaled_act_names(), [])
|
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ from vllm_ascend.ops.layer_shard_linear import (
|
|||||||
)
|
)
|
||||||
from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
|
from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
|
||||||
from vllm_ascend.quantization.methods.w8a8_static import AscendW8A8LinearMethod
|
from vllm_ascend.quantization.methods.w8a8_static import AscendW8A8LinearMethod
|
||||||
|
from vllm_ascend.quantization.utils import enable_fa_quant
|
||||||
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, get_weight_prefetch_method, maybe_trans_nz, weak_ref_tensors
|
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, get_weight_prefetch_method, maybe_trans_nz, weak_ref_tensors
|
||||||
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
from vllm_ascend.worker.npu_input_batch import NPUInputBatch
|
||||||
|
|
||||||
@@ -730,10 +731,7 @@ class AscendMLAImpl(MLAAttentionImpl):
|
|||||||
self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
self.vllm_config.kv_transfer_config is not None and self.vllm_config.kv_transfer_config.is_kv_producer
|
||||||
)
|
)
|
||||||
self.layer_name = kwargs.get("layer_name")
|
self.layer_name = kwargs.get("layer_name")
|
||||||
quant_config = self.vllm_config.quant_config
|
self.fa_quant_layer = enable_fa_quant(self.vllm_config, self.layer_name)
|
||||||
self.fa_quant_layer = (
|
|
||||||
quant_config.enabling_fa_quant(self.vllm_config, self.layer_name) if quant_config is not None else False
|
|
||||||
)
|
|
||||||
self.dtype = torch.int8 if self.fa_quant_layer else self.vllm_config.model_config.dtype
|
self.dtype = torch.int8 if self.fa_quant_layer else self.vllm_config.model_config.dtype
|
||||||
self.layer_sharding_kwargs = []
|
self.layer_sharding_kwargs = []
|
||||||
for layer_name in get_ascend_config().layer_sharding or []:
|
for layer_name in get_ascend_config().layer_sharding or []:
|
||||||
|
|||||||
@@ -660,9 +660,6 @@ class AscendModelSlimConfig(QuantizationConfig):
|
|||||||
extra_quant_dict[new_k] = self.quant_description[k]
|
extra_quant_dict[new_k] = self.quant_description[k]
|
||||||
self.quant_description.update(extra_quant_dict)
|
self.quant_description.update(extra_quant_dict)
|
||||||
|
|
||||||
def get_scaled_act_names(self) -> list[str]:
|
|
||||||
return []
|
|
||||||
|
|
||||||
def _add_kvcache_quant_metadata(self):
|
def _add_kvcache_quant_metadata(self):
|
||||||
fa_quant_type = self.quant_description.get("fa_quant_type", "")
|
fa_quant_type = self.quant_description.get("fa_quant_type", "")
|
||||||
self.enable_fa_quant = fa_quant_type != ""
|
self.enable_fa_quant = fa_quant_type != ""
|
||||||
|
|||||||
@@ -197,3 +197,12 @@ def maybe_auto_detect_quantization(vllm_config) -> None:
|
|||||||
from vllm.config import VllmConfig as _VllmConfig
|
from vllm.config import VllmConfig as _VllmConfig
|
||||||
|
|
||||||
vllm_config.quant_config = _VllmConfig._get_quantization_config(model_config, vllm_config.load_config)
|
vllm_config.quant_config = _VllmConfig._get_quantization_config(model_config, vllm_config.load_config)
|
||||||
|
|
||||||
|
|
||||||
|
def enable_fa_quant(vllm_config, layer_name=None) -> bool:
|
||||||
|
if vllm_config.quant_config is not None and getattr(vllm_config.quant_config, "enable_fa_quant", False):
|
||||||
|
if layer_name is not None:
|
||||||
|
return vllm_config.quant_config.enabling_fa_quant(vllm_config, layer_name)
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|||||||
@@ -109,6 +109,7 @@ from vllm_ascend.eplb.utils import model_register
|
|||||||
from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
|
from vllm_ascend.ops.rotary_embedding import set_cos_and_sin, update_cos_sin
|
||||||
from vllm_ascend.patch.worker.patch_draft_quarot import patch_load_weights
|
from vllm_ascend.patch.worker.patch_draft_quarot import patch_load_weights
|
||||||
from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
|
from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
|
||||||
|
from vllm_ascend.quantization.utils import enable_fa_quant
|
||||||
from vllm_ascend.sample.sampler import AscendSampler
|
from vllm_ascend.sample.sampler import AscendSampler
|
||||||
from vllm_ascend.spec_decode import get_spec_decode_method
|
from vllm_ascend.spec_decode import get_spec_decode_method
|
||||||
from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer
|
from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer
|
||||||
@@ -2763,7 +2764,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
k_dim,
|
k_dim,
|
||||||
v_dim,
|
v_dim,
|
||||||
]
|
]
|
||||||
if self.is_kv_consumer and self.vllm_config.quant_config is not None:
|
if self.is_kv_consumer and enable_fa_quant(self.vllm_config):
|
||||||
k_tensor_split_factor, v_tensor_split_factor = (
|
k_tensor_split_factor, v_tensor_split_factor = (
|
||||||
self.vllm_config.quant_config.get_kv_quant_split_factor(layer_name, kv_head_dim_list)
|
self.vllm_config.quant_config.get_kv_quant_split_factor(layer_name, kv_head_dim_list)
|
||||||
)
|
)
|
||||||
@@ -2950,7 +2951,7 @@ class NPUModelRunner(GPUModelRunner):
|
|||||||
v_dim,
|
v_dim,
|
||||||
)
|
)
|
||||||
k_cache_dtype = v_cache_dtype = current_kv_cache_spec.dtype
|
k_cache_dtype = v_cache_dtype = current_kv_cache_spec.dtype
|
||||||
if self.is_kv_consumer and self.vllm_config.quant_config is not None:
|
if self.is_kv_consumer and enable_fa_quant(self.vllm_config):
|
||||||
k_cache_dtype, v_cache_dtype = self.vllm_config.quant_config.get_kv_quant_dtype(
|
k_cache_dtype, v_cache_dtype = self.vllm_config.quant_config.get_kv_quant_dtype(
|
||||||
layer_name, current_kv_cache_spec.dtype, self.model_config
|
layer_name, current_kv_cache_spec.dtype, self.model_config
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user