enable online serving quantization (#877)
For online serving, the "ascend" quantization method is not natively offered as a choice, so we need to add it to the list of quantization methods; the user can then enable quantization with the "vllm serve --quantization ascend" command.
---------
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
This commit is contained in:
@@ -25,7 +25,7 @@ from vllm.logger import logger
|
||||
from vllm.platforms import Platform, PlatformEnum
|
||||
from vllm.utils import supports_dynamo
|
||||
|
||||
from vllm_ascend.utils import update_aclgraph_sizes
|
||||
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
|
||||
|
||||
CUSTOM_OP_ENABLED = False
|
||||
try:
|
||||
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
|
||||
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
|
||||
dispatch_key: str = "PrivateUse1"
|
||||
|
||||
supported_quantization: list[str] = ["ascend"]
|
||||
supported_quantization: list[str] = [ASCEND_QUATIZATION_METHOD]
|
||||
|
||||
def is_sleep_mode_available(self) -> bool:
|
||||
return True
|
||||
@@ -73,6 +73,15 @@ class NPUPlatform(Platform):
|
||||
from vllm_ascend.utils import adapt_patch
|
||||
adapt_patch(is_global_patch=True)
|
||||
|
||||
# For online serving, the "ascend" quantization method is not natively offered as a choice,
|
||||
# so we need to add the "ascend" quantization method to the list of quantization methods
|
||||
# so that the user can enable quantization using "vllm serve --quantization ascend".
|
||||
if parser is not None:
|
||||
quant_action = parser._option_string_actions.get('--quantization')
|
||||
if quant_action and hasattr(quant_action, 'choices'):
|
||||
if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
|
||||
quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
|
||||
|
||||
from vllm_ascend.quantization.quant_config import \
|
||||
AscendQuantConfig # noqa: F401
|
||||
|
||||
|
||||
Reference in New Issue
Block a user