# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
from transformers.configuration_utils import PretrainedConfig


class VisionEncoderConfig(PretrainedConfig):
    model_type: str = "vision"

    model_name: str = "vit_so400m_patch14_siglip_384.webli"
    image_size: int = 384
    patch_size: int = 16
    width: int = 1024
    layers: int = 24
    heads: int = 16
    mlp_ratio: int = 4
    global_pool: str = "map"
    ignore_head: bool = True
    class_token: bool = False
    num_classes: int = 0
    use_checkpoint: bool = False
    weight_init: str = "skip"
    deterministic: bool = False
    num_recomputing_layers: int = 0

    def __init__(self,
                 model_name: str = "vit_so400m_patch14_siglip_384.webli",
                 image_size: int = 384,
                 patch_size: int = 16,
                 width: int = 1024,
                 layers: int = 24,
                 heads: int = 16,
                 mlp_ratio: int = 4,
                 global_pool: str = "map",
                 ignore_head: bool = True,
                 class_token: bool = False,
                 num_classes: int = 0,
                 use_checkpoint: bool = False,
                 **kwargs):
        self.model_name = model_name
        self.image_size = image_size
        self.patch_size = patch_size
        self.width = width
        self.layers = layers
        self.heads = heads
        self.mlp_ratio = mlp_ratio
        self.global_pool = global_pool
        self.ignore_head = ignore_head
        self.class_token = class_token
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint

        super().__init__(**kwargs)


class MlpProjectorConfig(PretrainedConfig):
    model_type = "mlp_projector"
    projector_type: str = "downsample_mlp_gelu"
    input_dim: int = 1152
    n_embed: int = 2048
    depth: int = 2
    mlp_ratio: int = 1
    downsample_ratio: int = 2
    token_pooling: bool = False

    def __init__(self,
                 projector_type: str = "downsample_mlp_gelu",
                 input_dim: int = 1152,
                 n_embed: int = 2048,
                 depth: int = 2,
                 mlp_ratio: int = 1,
                 downsample_ratio: int = 2,
                 **kwargs):
        self.projector_type = projector_type
        self.input_dim = input_dim
        self.n_embed = n_embed
        self.depth = depth
        self.mlp_ratio = mlp_ratio
        self.downsample_ratio = downsample_ratio

        super().__init__(**kwargs)
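# A minimal sketch of how these projector fields typically combine
# (_projected_shape_sketch is a hypothetical illustration, not part of this
# module): with projector_type="downsample_mlp_gelu", the upstream projector
# concatenates each downsample_ratio x downsample_ratio group of visual tokens
# before the MLP, so the first linear layer sees
# input_dim * downsample_ratio**2 features and the token count shrinks by the
# same factor.
def _projected_shape_sketch(cfg: MlpProjectorConfig,
                            num_tokens: int) -> tuple[int, int]:
    """Hypothetical helper: (mlp_in_features, tokens_after_downsampling)."""
    factor = cfg.downsample_ratio**2
    # e.g. defaults (input_dim=1152, downsample_ratio=2): 4608 in-features,
    # and 576 tokens become 144.
    return cfg.input_dim * factor, num_tokens // factor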
class DeepseekV2Config(PretrainedConfig):
    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='gready',  # 'gready' (sic) matches upstream DeepSeek configs
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func='softmax',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class DeepseekVLV2Config(PretrainedConfig):
    model_type = "deepseek_vl_v2"
    vision_config: VisionEncoderConfig
    projector_config: MlpProjectorConfig

    tile_tag: str = "2D"
    global_view_pos: str = "head"
    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), )

    def __init__(self,
                 tile_tag: str = "2D",
                 global_view_pos: str = "head",
                 candidate_resolutions: tuple[tuple[int, int]] = ((384, 384), ),
                 **kwargs):
        super().__init__(**kwargs)

        # Nested dicts from the checkpoint's config.json are promoted to
        # typed sub-configs.
        vision_config = kwargs.get("vision_config", {})
        self.vision_config = VisionEncoderConfig(**vision_config)

        projector_config = kwargs.get("projector_config", {})
        self.projector_config = MlpProjectorConfig(**projector_config)

        language_config = kwargs.get("language_config", {})
        self.text_config = DeepseekV2Config(**language_config)

        self.tile_tag = tile_tag
        self.global_view_pos = global_view_pos
        self.candidate_resolutions = candidate_resolutions
        self.vocab_size = self.text_config.vocab_size
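# Minimal usage sketch, assuming illustrative values rather than a real
# checkpoint's config.json (in practice transformers' from_pretrained supplies
# these kwargs): nested dicts are parsed into the typed sub-configs above.
if __name__ == "__main__":
    config = DeepseekVLV2Config(
        tile_tag="2D",
        global_view_pos="head",
        candidate_resolutions=((384, 384), ),
        vision_config={"layers": 27, "width": 1152},
        projector_config={"input_dim": 1152, "n_embed": 2048},
        language_config={"hidden_size": 1280, "num_hidden_layers": 12},
    )
    assert isinstance(config.vision_config, VisionEncoderConfig)
    assert isinstance(config.projector_config, MlpProjectorConfig)
    # vocab_size is mirrored from the language-model sub-config.
    assert config.vocab_size == config.text_config.vocab_size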