[CPU] Fix TP padding issue on Phi-4 (#8289)

This commit is contained in:
blzheng
2025-08-18 07:25:26 +08:00
committed by GitHub
parent b341b7dbce
commit ebbb75e917
5 changed files with 93 additions and 27 deletions

View File

@@ -129,6 +129,25 @@ def get_config(
# NOTE(review): fragment of get_config — loads the HF AutoConfig, then
# patches in a hard-coded Phi-4-multimodal vision_config before the text
# config is extracted. Surrounding function body is outside this view.
config = AutoConfig.from_pretrained(
model, trust_remote_code=trust_remote_code, revision=revision, **kwargs
)
# Only the FIRST listed architecture is checked; presumably Phi4MM
# checkpoints always list it first — TODO confirm against upstream configs.
if (
config.architectures is not None
and config.architectures[0] == "Phi4MMForCausalLM"
):
# Phi4MMForCausalLM uses a hard-coded vision_config. See:
# https://github.com/vllm-project/vllm/blob/6071e989df1531b59ef35568f83f7351afb0b51e/vllm/model_executor/models/phi4mm.py#L71
# We set it here to support cases where num_attention_heads is not divisible by the TP size.
# Lazy import: keeps SiglipVisionConfig off the hot import path for
# non-Phi4MM models.
from transformers import SiglipVisionConfig
# SigLIP vision-tower hyperparameters mirrored from vLLM's phi4mm
# model definition (link above); overrides whatever the checkpoint has.
vision_config = {
"hidden_size": 1152,
"image_size": 448,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 26, # Model is originally 27-layer, we only need the first 26 layers for feature extraction.
"patch_size": 14,
}
config.vision_config = SiglipVisionConfig(**vision_config)
# Extract the text-only sub-config (multimodal configs nest it);
# continuation of this branch is outside the visible region.
text_config = get_hf_text_config(config=config)
if isinstance(model, str) and text_config is not None: