[2/2] Support Qserve (#6521)

This commit is contained in:
HandH1998
2025-05-24 03:39:18 +08:00
committed by GitHub
parent d2e0881a34
commit 1b2e8f76d9
5 changed files with 268 additions and 5 deletions

View File

@@ -349,6 +349,7 @@ class ModelConfig:
"w8a8_int8",
"w8a8_fp8",
"moe_wna16",
"qoq",
]
compatible_quantization_methods = {
"modelopt_fp4": ["modelopt"],
@@ -458,6 +459,8 @@ def _get_and_verify_dtype(
# NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
# because config.torch_dtype can be None.
config_dtype = getattr(config, "torch_dtype", None)
if isinstance(config_dtype, str):
config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
if config_dtype is None:
config_dtype = torch.float32