diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index fbc1dc68..2d8834b1 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -25,7 +25,7 @@
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 from vllm.utils import supports_dynamo
-from vllm_ascend.utils import update_aclgraph_sizes
+from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, update_aclgraph_sizes
 
 CUSTOM_OP_ENABLED = False
 try:
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
     device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
     dispatch_key: str = "PrivateUse1"
 
-    supported_quantization: list[str] = ["ascend"]
+    supported_quantization: list[str] = [ASCEND_QUANTIZATION_METHOD]
 
     def is_sleep_mode_available(self) -> bool:
         return True
@@ -73,6 +73,15 @@ class NPUPlatform(Platform):
 
         from vllm_ascend.utils import adapt_patch
         adapt_patch(is_global_patch=True)
 
+        # For online serving, "ascend" quantization method is not a choice natively,
+        # so we need to add "ascend" quantization method to quantization methods list
+        # and the user can enable quantization using "vllm serve --quantization ascend".
+        if parser is not None:
+            quant_action = parser._option_string_actions.get('--quantization')
+            if quant_action and hasattr(quant_action, 'choices'):
+                if ASCEND_QUANTIZATION_METHOD not in quant_action.choices:
+                    quant_action.choices.append(ASCEND_QUANTIZATION_METHOD)
+
         from vllm_ascend.quantization.quant_config import \
             AscendQuantConfig  # noqa: F401
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 1aededd3..3b0d0c44 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -38,11 +38,12 @@
 from vllm.model_executor.parameter import PerTensorScaleParameter
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
+from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
 
 from .quantizer import AscendQuantizer
 
 
-@register_quantization_config("ascend")
+@register_quantization_config(ASCEND_QUANTIZATION_METHOD)
 class AscendQuantConfig(QuantizationConfig):
     """Config class for Ascend
@@ -58,7 +59,7 @@ class AscendQuantConfig(QuantizationConfig):
 
     @classmethod
     def get_name(cls) -> str:
-        return "ascend"
+        return ASCEND_QUANTIZATION_METHOD
 
     @classmethod
     def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -81,7 +82,7 @@ class AscendQuantConfig(QuantizationConfig):
     def override_quantization_method(cls, hf_quant_cfg,
                                      user_quant) -> Optional[str]:
         if torch.npu.is_available():
-            return "ascend"
+            return ASCEND_QUANTIZATION_METHOD
         return None
 
     def get_quant_method(self, layer: torch.nn.Module,
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 6a750da3..cd83faed 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -38,6 +38,8 @@ else:
 # Maximum number of graphs that can be captured by ACL Graph
 MAX_CAPTURE_SIZE = 1920
 
+ASCEND_QUANTIZATION_METHOD = "ascend"
+
 
 def try_register_lib(lib_name: str, lib_info: str = ""):
     import importlib