chore: upgrade transformers 4.52.3 (#6575)

Co-authored-by: Mick <mickjagger19@icloud.com>
2025-05-25 22:49:58 -07:00
parent 84147254c9
commit 7eb9d8e594
5 changed files with 152 additions and 125 deletions
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -41,7 +41,7 @@ runtime_common = [
    "soundfile==0.13.1",
    "scipy",
    "torchao==0.9.0",
-    "transformers==4.51.1",
+    "transformers==4.52.3",
    "uvicorn",
    "uvloop",
    "xgrammar==0.1.19",
--- a/python/sglang/srt/configs/internvl.py
+++ b/python/sglang/srt/configs/internvl.py
@@ -7,11 +7,8 @@ import sentencepiece as spm
 from transformers import (
    TOKENIZER_MAPPING,
    LlamaConfig,
-    Phi3Config,
    PretrainedConfig,
    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2Config,
 )

 from sglang.utils import logger
@@ -302,24 +299,23 @@ class InternVLChatConfig(PretrainedConfig):
            )

        if llm_config is None:
-            # TODO: There might still be a bug in transformers version 4.44 and above.
-            llm_config = {"architectures": [""]}
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
            logger.info(
                "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
            )
+
        self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config["architectures"][0] == "LlamaForCausalLM":
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
            self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
            self.llm_config = InternLM2Config(**llm_config)
-        elif llm_config["architectures"][0] == "Phi3ForCausalLM":
-            self.llm_config = Phi3Config(**llm_config)
-        elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.llm_config = Qwen2Config(**llm_config)
        else:
            raise ValueError(
-                "Unsupported architecture: {}".format(llm_config["architectures"][0])
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
            )
+
        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.pad2square = pad2square
--- a/python/sglang/srt/configs/model_config.py
+++ b/python/sglang/srt/configs/model_config.py
@@ -196,6 +196,21 @@ class ModelConfig:
            self.v_head_dim = self.hf_text_config.v_head_dim
            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
        else:
+            if (
+                "MistralModel" in self.hf_config.architectures
+                or "MixtralForCausalLM" in self.hf_config.architectures
+            ):
+                if getattr(self, "head_dim", None) is None:
+                    self.head_dim = (
+                        self.hf_config.hidden_size // self.hf_config.num_attention_heads
+                    )
+                    # In transformers==4.52.3, MistralConfig.head_dim can be None; backfill it
+                    if (
+                        not hasattr(self.hf_text_config, "head_dim")
+                        or self.hf_text_config.head_dim is None
+                    ):
+                        setattr(self.hf_text_config, "head_dim", self.head_dim)
+
            self.attention_arch = AttentionArch.MHA

        self.num_attention_heads = self.hf_text_config.num_attention_heads
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -26,6 +26,7 @@ from transformers import (
    AutoModelForCausalLM,
    AutoModelForVision2Seq,
    AutoProcessor,
+    GenerationConfig,
 )

 from sglang.srt.entrypoints.engine import Engine
@@ -382,13 +383,17 @@ class HFRunner:
                model = base_model

            outputs = model.generate(
-                input_ids,
-                do_sample=False,
-                temperature=None,
-                top_p=None,
-                max_new_tokens=max_new_tokens,
-                return_dict_in_generate=True,
-                output_scores=(not output_str_only),
+                input_ids=input_ids,
+                generation_config=GenerationConfig(
+                    do_sample=False,
+                    temperature=None,
+                    top_p=None,
+                    max_new_tokens=max_new_tokens,
+                    return_dict_in_generate=True,
+                    output_scores=(not output_str_only),
+                    # make sure to disable compile
+                    disable_compile=True,
+                ),
            )

            text = tokenizer.decode(