enable online serving quantization (#877)
For online serving, the "ascend" quantization method is not natively among the accepted choices, so we add it to the list of quantization methods; users can then enable quantization with the "vllm serve --quantization ascend" command.
---------
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
This commit is contained in:
@@ -25,7 +25,7 @@ from vllm.logger import logger
|
|||||||
from vllm.platforms import Platform, PlatformEnum
|
from vllm.platforms import Platform, PlatformEnum
|
||||||
from vllm.utils import supports_dynamo
|
from vllm.utils import supports_dynamo
|
||||||
|
|
||||||
from vllm_ascend.utils import update_aclgraph_sizes
|
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
|
||||||
|
|
||||||
CUSTOM_OP_ENABLED = False
|
CUSTOM_OP_ENABLED = False
|
||||||
try:
|
try:
|
||||||
@@ -60,7 +60,7 @@ class NPUPlatform(Platform):
|
|||||||
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
|
device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES"
|
||||||
dispatch_key: str = "PrivateUse1"
|
dispatch_key: str = "PrivateUse1"
|
||||||
|
|
||||||
supported_quantization: list[str] = ["ascend"]
|
supported_quantization: list[str] = [ASCEND_QUATIZATION_METHOD]
|
||||||
|
|
||||||
def is_sleep_mode_available(self) -> bool:
|
def is_sleep_mode_available(self) -> bool:
|
||||||
return True
|
return True
|
||||||
@@ -73,6 +73,15 @@ class NPUPlatform(Platform):
|
|||||||
from vllm_ascend.utils import adapt_patch
|
from vllm_ascend.utils import adapt_patch
|
||||||
adapt_patch(is_global_patch=True)
|
adapt_patch(is_global_patch=True)
|
||||||
|
|
||||||
|
# For online serving, "ascend" quantization method is not a choice natively,
|
||||||
|
# so we need to add "ascend" quantization method to quantization methods list
|
||||||
|
# and the user can enable quantization using "vllm serve --quantization ascend".
|
||||||
|
if parser is not None:
|
||||||
|
quant_action = parser._option_string_actions.get('--quantization')
|
||||||
|
if quant_action and hasattr(quant_action, 'choices'):
|
||||||
|
if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
|
||||||
|
quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
|
||||||
|
|
||||||
from vllm_ascend.quantization.quant_config import \
|
from vllm_ascend.quantization.quant_config import \
|
||||||
AscendQuantConfig # noqa: F401
|
AscendQuantConfig # noqa: F401
|
||||||
|
|
||||||
|
|||||||
@@ -38,11 +38,12 @@ from vllm.model_executor.parameter import PerTensorScaleParameter
|
|||||||
from vllm.model_executor.utils import set_weight_attrs
|
from vllm.model_executor.utils import set_weight_attrs
|
||||||
|
|
||||||
from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
|
from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
|
||||||
|
from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD
|
||||||
|
|
||||||
from .quantizer import AscendQuantizer
|
from .quantizer import AscendQuantizer
|
||||||
|
|
||||||
|
|
||||||
@register_quantization_config("ascend")
|
@register_quantization_config(ASCEND_QUATIZATION_METHOD)
|
||||||
class AscendQuantConfig(QuantizationConfig):
|
class AscendQuantConfig(QuantizationConfig):
|
||||||
"""Config class for Ascend
|
"""Config class for Ascend
|
||||||
|
|
||||||
@@ -58,7 +59,7 @@ class AscendQuantConfig(QuantizationConfig):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_name(cls) -> str:
|
def get_name(cls) -> str:
|
||||||
return "ascend"
|
return ASCEND_QUATIZATION_METHOD
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
|
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
|
||||||
@@ -81,7 +82,7 @@ class AscendQuantConfig(QuantizationConfig):
|
|||||||
def override_quantization_method(cls, hf_quant_cfg,
|
def override_quantization_method(cls, hf_quant_cfg,
|
||||||
user_quant) -> Optional[str]:
|
user_quant) -> Optional[str]:
|
||||||
if torch.npu.is_available():
|
if torch.npu.is_available():
|
||||||
return "ascend"
|
return ASCEND_QUATIZATION_METHOD
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_quant_method(self, layer: torch.nn.Module,
|
def get_quant_method(self, layer: torch.nn.Module,
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ else:
|
|||||||
# Maximum number of graphs that can be captured by ACL Graph
|
# Maximum number of graphs that can be captured by ACL Graph
|
||||||
MAX_CAPTURE_SIZE = 1920
|
MAX_CAPTURE_SIZE = 1920
|
||||||
|
|
||||||
|
ASCEND_QUATIZATION_METHOD = "ascend"
|
||||||
|
|
||||||
|
|
||||||
def try_register_lib(lib_name: str, lib_info: str = ""):
|
def try_register_lib(lib_name: str, lib_info: str = ""):
|
||||||
import importlib
|
import importlib
|
||||||
|
|||||||
Reference in New Issue
Block a user