Add minimal vLLM 0.16.1 build repo for BI-V150
This commit is contained in:
129
vllm/transformers_utils/configs/kimi_k25.py
Normal file
129
vllm/transformers_utils/configs/kimi_k25.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Kimi-K2.5 Model Configuration.
|
||||
|
||||
This configuration supports video-chunk as an internal modality type.
|
||||
A video-chunk is the smallest independently processable unit of video.
|
||||
"""
|
||||
|
||||
from transformers import DeepseekV3Config
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
|
||||
|
||||
class KimiK25VisionConfig(PretrainedConfig):
    """Configuration for the Kimi-K2.5 vision tower and multimodal projector.

    Every named argument is stored unchanged as an attribute of the same
    name, with one exception: ``mm_hidden_size`` falls back to
    ``hidden_size`` when it is left as ``None``. Any remaining keyword
    arguments are forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "kimi_k25_vision"

    def __init__(
        self,
        # Vision tower arguments
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        init_pos_emb_time: int = 4,
        pos_emb_type: str = "divided_fixed",
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        video_attn_type: str = "spatial_temporal",
        merge_type: str = "sd2_tpool",
        # Multimodal projector arguments
        mm_projector_type: str = "patchmerger",
        mm_hidden_size: int | None = None,
        projector_hidden_act: str = "gelu",
        projector_ln_eps: float = 1e-5,
        **kwargs,
    ):
        # The base class consumes the generic config keyword arguments.
        super().__init__(**kwargs)

        # --- Vision tower settings ---
        self.patch_size = patch_size
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        self.init_pos_emb_time = init_pos_emb_time
        self.pos_emb_type = pos_emb_type
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.merge_kernel_size = merge_kernel_size
        self.video_attn_type = video_attn_type
        self.merge_type = merge_type

        # --- Multimodal projector settings ---
        self.mm_projector_type = mm_projector_type
        # An omitted mm_hidden_size defaults to the tower's hidden_size.
        self.mm_hidden_size = (
            hidden_size if mm_hidden_size is None else mm_hidden_size
        )
        self.projector_hidden_act = projector_hidden_act
        self.projector_ln_eps = projector_ln_eps
|
||||
|
||||
|
||||
class KimiK25Config(PretrainedConfig):
    """Top-level configuration for the Kimi-K2.5 model.

    Kimi-K2.5 extends Kimi-K2 with vision support using video-chunks.
    A video-chunk consists of multiple consecutive frames
    that are processed together with temporal pooling.

    Args:
        vision_config: Configuration for the vision tower and projector.
            A ``dict`` is expanded into ``KimiK25VisionConfig``; ``None``
            yields a default ``KimiK25VisionConfig``.
        text_config: Configuration for the text model (DeepseekV3).
            A ``dict`` is expanded into ``DeepseekV3Config``; ``None``
            yields a default ``DeepseekV3Config``.
        ignore_index: The ignore index for the loss function.
        media_placeholder_token_id: The token ID for media placeholders.
        pad_token_id: The token ID for padding (forwarded to the base
            ``PretrainedConfig``).
        use_unified_vision_chunk: Boolean flag stored as-is on the config.
        video_placeholder: Placeholder string stored as-is on the config.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Note:
        When the vision config's ``mm_hidden_size`` equals its own
        ``hidden_size``, it is overwritten with the text model's
        ``hidden_size``. This happens in place, so a ``KimiK25VisionConfig``
        instance supplied by the caller is modified as well.
    """

    model_type = "kimi_k25"

    def __init__(
        self,
        vision_config: dict | KimiK25VisionConfig | None = None,
        text_config: dict | DeepseekV3Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        use_unified_vision_chunk: bool = False,
        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
        **kwargs,
    ):
        # Normalize the vision sub-config into a KimiK25VisionConfig.
        if isinstance(vision_config, dict):
            vision_config = KimiK25VisionConfig(**vision_config)
        elif vision_config is None:
            vision_config = KimiK25VisionConfig()
        self.vision_config: KimiK25VisionConfig = vision_config

        # Normalize the text sub-config into a DeepseekV3Config.
        if isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        elif text_config is None:
            text_config = DeepseekV3Config()
        self.text_config: DeepseekV3Config = text_config

        # mm_hidden_size equal to the vision hidden size is treated as
        # "not explicitly set" and replaced by the text hidden size.
        vision = self.vision_config
        if vision.mm_hidden_size == vision.hidden_size:
            vision.mm_hidden_size = self.text_config.hidden_size

        # Remaining scalar settings.
        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id
        self.use_unified_vision_chunk = use_unified_vision_chunk
        self.video_placeholder = video_placeholder

        # Surface the text model's quantization config at the top level,
        # but only when the text config actually carries one.
        text_quant_config = getattr(
            self.text_config, "quantization_config", None
        )
        if text_quant_config is not None:
            self.quantization_config = text_quant_config

        super().__init__(pad_token_id=pad_token_id, **kwargs)

    @property
    def hidden_size(self) -> int:
        """Hidden size of the text model, exposed here for compatibility."""
        return self.text_config.hidden_size

    @property
    def vocab_size(self) -> int:
        """Vocabulary size of the text model, exposed for compatibility."""
        return self.text_config.vocab_size
|
||||
Reference in New Issue
Block a user