Fix incorrect KV cache allocation for MTP models. (#8482)
Co-authored-by: Stefan He <hebiaobuaa@gmail.com>
This commit is contained in:
@@ -261,6 +261,9 @@ class ModelConfig:
        self.num_key_value_heads = self.num_attention_heads
        self.hidden_size = self.hf_text_config.hidden_size
        self.num_hidden_layers = self.hf_text_config.num_hidden_layers
        self.num_nextn_predict_layers = getattr(
            self.hf_text_config, "num_nextn_predict_layers", None
        )
        self.vocab_size = self.hf_text_config.vocab_size

        # Verify quantization
Reference in New Issue
Block a user