model: support nvidia/Llama-3_3-Nemotron-Super-49B-v1 (#9067)

Co-authored-by: Kyle Huang <kylhuang@nvidia.com>
2025-08-17 11:48:15 +03:00
parent e47800e176
commit 845d12a979
6 changed files with 465 additions and 5 deletions
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -341,6 +341,19 @@ class ModelConfig:
                "kv_n_heads",
                self.hf_config.num_attention_heads,
            )
+        if self.hf_config.model_type in ["nemotron-nas"]:
+            nkvh = {
+                self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+                for block in self.hf_config.block_configs
+                if not block.attention.no_op
+            }
+            if len(nkvh) == 0:
+                raise RuntimeError("Couldn't determine number of kv heads")
+            if len(nkvh) > 1:
+                raise ValueError(
+                    "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+                )
+            return next(iter(nkvh))

        attributes = [
            # For Falcon: