model: support nvidia/Llama-3_3-Nemotron-Super-49B-v1 (#9067)

Co-authored-by: Kyle Huang <kylhuang@nvidia.com>
This commit is contained in:
Netanel Haber
2025-08-17 11:48:15 +03:00
committed by GitHub
parent e47800e176
commit 845d12a979
6 changed files with 465 additions and 5 deletions

View File

@@ -341,6 +341,19 @@ class ModelConfig:
"kv_n_heads",
self.hf_config.num_attention_heads,
)
if self.hf_config.model_type in ["nemotron-nas"]:
nkvh = {
self.hf_config.num_attention_heads // block.attention.n_heads_in_group
for block in self.hf_config.block_configs
if not block.attention.no_op
}
if len(nkvh) == 0:
raise RuntimeError("Couldn't determine number of kv heads")
if len(nkvh) > 1:
raise ValueError(
"Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
)
return next(iter(nkvh))
attributes = [
# For Falcon: