Fix loading KV quantization scale; Enable modelopt kv cache (#4686)
Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
@@ -239,7 +239,7 @@ class ModelConfig:
|
||||
# check if is modelopt model -- modelopt doesn't have corresponding field
|
||||
# in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
|
||||
# example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
|
||||
is_local = os.path.isdir(self.model_path)
|
||||
is_local = os.path.exists(self.model_path)
|
||||
modelopt_quant_config = {"quant_method": "modelopt"}
|
||||
if not is_local:
|
||||
from huggingface_hub import HfApi
|
||||
|
||||
Reference in New Issue
Block a user