diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 6ad802367..dc4198b52 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -369,6 +369,9 @@ class MixtralForCausalLM(nn.Module): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + # Skip loading kv_scale from ckpts towards new design. + if name.endswith(".kv_scale") and name not in params_dict: + continue if name is None: continue