Fix loading KV quantization scale; Enable modelopt kv cache (#4686)

Co-authored-by: qingquansong <ustcsqq@gmail.com>
2025-04-08 09:11:35 -07:00
parent 88d6fd9a11
commit 2695ab0537
38 changed files with 151 additions and 76 deletions
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -239,7 +239,7 @@ class ModelConfig:
            # check if is modelopt model -- modelopt doesn't have corresponding field
            # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
-            is_local = os.path.isdir(self.model_path)
+            is_local = os.path.exists(self.model_path)
            modelopt_quant_config = {"quant_method": "modelopt"}
            if not is_local:
                from huggingface_hub import HfApi