enable online serving quantization (#877)

For online serving, "ascend" is not natively one of the accepted quantization methods, so we add it to the list of quantization method choices; users can then enable quantization with the "vllm serve --quantization ascend" command. --------- Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-05-17 17:36:04 +08:00
parent a8730e7a3c
commit 00e0243561
3 changed files with 17 additions and 5 deletions
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -25,7 +25,7 @@ from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
 from vllm.utils import supports_dynamo

-from vllm_ascend.utils import update_aclgraph_sizes
+from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes

 CUSTOM_OP_ENABLED = False
 try:
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
    device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
    dispatch_key: str = "PrivateUse1"

-    supported_quantization: list[str] = ["ascend"]
+    supported_quantization: list[str] = [ASCEND_QUATIZATION_METHOD]

    def is_sleep_mode_available(self) -> bool:
        return True
@@ -73,6 +73,15 @@ class NPUPlatform(Platform):
        from vllm_ascend.utils import adapt_patch
        adapt_patch(is_global_patch=True)

+        # For online serving, "ascend" is not natively one of the choices accepted
+        # by the "--quantization" option, so append it to that option's choices.
+        # Users can then enable it with "vllm serve --quantization ascend".
+        if parser is not None:
+            quant_action = parser._option_string_actions.get('--quantization')
+            if quant_action and hasattr(quant_action, 'choices'):
+                if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
+                    quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
+
        from vllm_ascend.quantization.quant_config import \
            AscendQuantConfig  # noqa: F401