From e0b5dbcec13cea0bed7737463052480f7468ce41 Mon Sep 17 00:00:00 2001 From: HAI Date: Thu, 3 Oct 2024 01:52:26 -0700 Subject: [PATCH] [FP8 KV Cache] Avoid KeyError at loading pre-quantized FP8 model with kv_scale (#1559) --- python/sglang/srt/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/sglang/srt/models/llama.py b/python/sglang/srt/models/llama.py index 431250260..930c6838d 100644 --- a/python/sglang/srt/models/llama.py +++ b/python/sglang/srt/models/llama.py @@ -400,6 +400,9 @@ class LlamaForCausalLM(nn.Module): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip kv_scale from pre-quantized checkpoints when the current model no longer defines it. + if name.endswith(".kv_scale") and name not in params_dict: + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight)