[GLM4.1V and GLM4.5V] Add vision transformer num_dummy_head support: max tp=4 -> max tp=8 (#9059)

This commit is contained in:
Binyao Jiang
2025-08-18 14:40:13 -07:00
committed by GitHub
parent 98b44e9e56
commit c2fbf60f39
9 changed files with 150 additions and 102 deletions

View File

@@ -11,6 +11,7 @@ from sglang.srt.distributed import (
get_tensor_model_parallel_world_size,
)
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.layers.attention import vision_utils
from sglang.srt.layers.logits_processor import LogitsProcessor
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
from sglang.srt.layers.pooler import Pooler, PoolingType
@@ -40,6 +41,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
config.moe_layer_freq = 1
self.config = config
vision_utils.update_vit_attn_dummy_heads_config(self.config)
self.tp_size = get_tensor_model_parallel_world_size()
self.quant_config = quant_config
self.determine_num_fused_shared_experts("Glm4MoeForCausalLM")
@@ -385,6 +387,10 @@ class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
weight_loader = getattr(
param, "weight_loader", default_weight_loader
)
if "visual" in name:
loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
self.config, name, loaded_weight
)
weight_loader(param, loaded_weight)