[Feature] Support Deepseek-VL2 (#2798)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: Yi Zhang <1109276519@qq.com>
Author: 萝卜菜
Date: 2025-03-17 14:07:59 +08:00
Committed via: GitHub
parent 0212d2e288
commit d6d21640d3
13 changed files with 1259 additions and 2 deletions

View File

@@ -135,6 +135,11 @@ class ModelConfig:
self.attention_arch = AttentionArch.MLA
self.kv_lora_rank = self.hf_config.kv_lora_rank
self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
self.head_dim = 256
self.attention_arch = AttentionArch.MLA
self.kv_lora_rank = self.hf_text_config.kv_lora_rank
self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
else:
self.attention_arch = AttentionArch.MHA
@@ -362,6 +367,8 @@ def get_hf_text_config(config: PretrainedConfig):
# if transformers config doesn't align with this assumption.
assert hasattr(config.text_config, "num_attention_heads")
return config.text_config
if hasattr(config, "language_config"):
return config.language_config
else:
return config
@@ -465,6 +472,7 @@ multimodal_model_archs = [
"Qwen2_5_VLForConditionalGeneration",
"MiniCPMV",
"MultiModalityCausalLM",
"DeepseekVL2ForCausalLM",
]