enable online serving quantization (#877)

For online serving, the "ascend" quantization method is not natively available,
so we add "ascend" to the list of supported quantization methods. Users can then
enable quantization with the "vllm serve --quantization ascend" command.

---------

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
This commit is contained in:
22dimensions
2025-05-17 17:36:04 +08:00
committed by GitHub
parent a8730e7a3c
commit 00e0243561
3 changed files with 17 additions and 5 deletions

View File

@@ -38,11 +38,12 @@ from vllm.model_executor.parameter import PerTensorScaleParameter
from vllm.model_executor.utils import set_weight_attrs
from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD
from .quantizer import AscendQuantizer
@register_quantization_config("ascend")
@register_quantization_config(ASCEND_QUATIZATION_METHOD)
class AscendQuantConfig(QuantizationConfig):
"""Config class for Ascend
@@ -58,7 +59,7 @@ class AscendQuantConfig(QuantizationConfig):
@classmethod
def get_name(cls) -> str:
    """Return the canonical name of this quantization method.

    Uses the shared ASCEND_QUATIZATION_METHOD constant (value "ascend")
    from vllm_ascend.utils so the name stays consistent with the
    @register_quantization_config registration and CLI `--quantization`
    matching.
    """
    # Bug fix for diff residue: the stale pre-change line
    # `return "ascend"` made the constant-based return unreachable.
    return ASCEND_QUATIZATION_METHOD
@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
@@ -81,7 +82,7 @@ class AscendQuantConfig(QuantizationConfig):
def override_quantization_method(cls, hf_quant_cfg,
                                 user_quant) -> Optional[str]:
    """Override the quantization method selection for online serving.

    Args:
        hf_quant_cfg: quantization config from the HF model (unused here;
            presumably kept for interface compatibility with vLLM's
            QuantizationConfig hook — TODO confirm against base class).
        user_quant: the user-requested quantization method (unused here).

    Returns:
        The Ascend method name when an NPU device is available so vLLM
        routes quantization through this config; None otherwise, letting
        vLLM fall back to its default selection.
    """
    # Bug fix for diff residue: the stale pre-change line
    # `return "ascend"` made the constant-based return unreachable.
    if torch.npu.is_available():
        return ASCEND_QUATIZATION_METHOD
    return None
def get_quant_method(self, layer: torch.nn.Module,