Fix loading KV quantization scale; Enable modelopt kv cache (#4686)

Co-authored-by: qingquansong <ustcsqq@gmail.com>
This commit is contained in:
Yun Dai
2025-04-08 09:11:35 -07:00
committed by GitHub
parent 88d6fd9a11
commit 2695ab0537
38 changed files with 151 additions and 76 deletions

View File

@@ -239,7 +239,7 @@ class ModelConfig:
# Check whether this is a ModelOpt model. ModelOpt has no corresponding field
# in the HF `config.json`; instead it has a standalone `hf_quant_config.json` in the root directory.
# Example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
is_local = os.path.isdir(self.model_path)
is_local = os.path.exists(self.model_path)
modelopt_quant_config = {"quant_method": "modelopt"}
if not is_local:
from huggingface_hub import HfApi