model: support nvidia/Llama-3_3-Nemotron-Super-49B-v1 (#9067)
Co-authored-by: Kyle Huang <kylhuang@nvidia.com>
This commit is contained in:
@@ -341,6 +341,19 @@ class ModelConfig:
|
||||
"kv_n_heads",
|
||||
self.hf_config.num_attention_heads,
|
||||
)
|
||||
if self.hf_config.model_type in ["nemotron-nas"]:
|
||||
nkvh = {
|
||||
self.hf_config.num_attention_heads // block.attention.n_heads_in_group
|
||||
for block in self.hf_config.block_configs
|
||||
if not block.attention.no_op
|
||||
}
|
||||
if len(nkvh) == 0:
|
||||
raise RuntimeError("Couldn't determine number of kv heads")
|
||||
if len(nkvh) > 1:
|
||||
raise ValueError(
|
||||
"Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
|
||||
)
|
||||
return next(iter(nkvh))
|
||||
|
||||
attributes = [
|
||||
# For Falcon:
|
||||
|
||||
Reference in New Issue
Block a user