Integration of TurboMind AWQ (#2828)

Co-authored-by: root <bjmsong@126.com>
This commit is contained in:
bjmsong
2025-01-13 20:14:16 +08:00
committed by GitHub
parent 51ab3ccf47
commit 17de02f98d
8 changed files with 411 additions and 2 deletions

View File

@@ -14,6 +14,7 @@
import json
import logging
import sys
from enum import IntEnum, auto
from typing import List, Optional, Set, Union
@@ -230,7 +231,7 @@ class ModelConfig:
# Parse quantization method from the HF model config, if available.
quant_cfg = self._parse_quant_hf_config()
if quant_cfg is not None:
if quant_cfg is not None and not quantization_in_turbomind(self.quantization):
quant_method = quant_cfg.get("quant_method", "").lower()
# Detect which checkpoint it is
@@ -401,3 +402,10 @@ def is_multimodal_model(model_architectures: List[str]):
def is_encoder_decoder_model(model_architectures: List[str]):
    """Return True when the architecture list contains an encoder-decoder model.

    Currently only Mllama (Llama 3.2 vision) is recognized as encoder-decoder.
    """
    encoder_decoder_arch = "MllamaForConditionalGeneration"
    return any(arch == encoder_decoder_arch for arch in model_architectures)
def quantization_in_turbomind(quantization: str) -> bool:
    """Return True if the quantization method is handled by the TurboMind backend.

    Args:
        quantization: Quantization method name (may be None when unset).

    Returns:
        True for TurboMind-native methods (currently only ``awq_turbomind``),
        False otherwise.
    """
    # Direct membership test instead of if/else returning True/False;
    # a tuple keeps this trivially extensible to future TurboMind methods.
    return quantization in ("awq_turbomind",)