commit e6d684576f3a810920165a8a471604015b829fdc Author: ModelHub XC Date: Mon May 25 10:42:12 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: AIDC-AI/Ovis2.5-2B Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..d2fc868 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,51 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*.tfevents* filter=lfs diff=lfs merge=lfs -text +*.db* filter=lfs diff=lfs merge=lfs -text +*.ark* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text +**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.gguf* filter=lfs diff=lfs merge=lfs -text +*.ggml filter=lfs diff=lfs merge=lfs -text +*.llamafile* filter=lfs diff=lfs merge=lfs 
-text +*.pt2 filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text + +merges.txt filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text +vocab.json filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..522e362 --- /dev/null +++ b/LICENSE @@ -0,0 +1,10 @@ +Copyright (C) 2025 AIDC-AI +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. \ No newline at end of file diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..c75e4d6 --- /dev/null +++ b/NOTICE @@ -0,0 +1,5 @@ +Copyright (C) 2025 AIDC-AI +Licensed under the Apache 2.0 (the "License"). +The model was trained based on the following models: +1. Qwen3-1.7B (https://huggingface.co/Qwen/Qwen3-1.7B), license: Apache License 2.0 (https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/LICENSE). +2. Siglip2 (https://huggingface.co/google/siglip2-so400m-patch16-512), license: Apache License 2.0 (https://choosealicense.com/licenses/apache-2.0/). 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7aa26ad --- /dev/null +++ b/README.md @@ -0,0 +1,337 @@ +--- +license: apache-2.0 +datasets: +- AIDC-AI/Ovis-dataset +library_name: transformers +tags: +- MLLM +pipeline_tag: image-text-to-text +language: +- en +- zh +--- +# Ovis2.5-2B +
+ +
+ +

+ technical report + code + demo + models +

+ + +## Introduction + +We are pleased to announce the release of **Ovis2.5**, the successor to Ovis2, designed for native-resolution visual perception and enhanced multimodal reasoning. +It integrates a native-resolution vision transformer (NaViT) that processes images at their original, variable resolutions, eliminating the need for fixed-resolution tiling and preserving both fine details and global layout—crucial for visually dense content such as charts and diagrams. +To strengthen reasoning, Ovis2.5 is trained not only on linear chain-of-thought (CoT) but also on reflective reasoning, including self-checking and revision. +This advanced capability is available at inference as an optional *thinking mode*, enabling users to trade latency for higher accuracy on complex inputs. + +Building on these advances, **Ovis2.5-9B** achieves an average score of 78.3 on the OpenCompass multimodal evaluation suite (SOTA among open-source MLLMs under 40B parameters), while the lightweight **Ovis2.5-2B** scores 73.9, continuing the “small model, big performance” philosophy for resource-constrained scenarios. + + +
+ +
+ +**Key Features** +* **Native-Resolution Perception** — NaViT vision encoder preserves fine details and global structure without lossy tiling. +* **Deep-Reasoning Capability** — Optional *thinking mode* for self-checking and revision beyond linear CoT. *Thinking budget* supported. +* **Chart & Document OCR** — State-of-the-art at its scale for complex chart analysis, document understanding (including tables and forms), and OCR. +* **Broad Task Coverage** — Demonstrates leading performance on image reasoning, video understanding, and grounding benchmarks, showcasing strong general multimodal capability. + +
+ +
+ +## Quick Inference + +Below is a simple example demonstrating how to run Ovis2.5 with a single image input. For accelerated inference with **vLLM**, refer to [GitHub](https://github.com/AIDC-AI/Ovis). + +First, install the required dependencies: + +```bash +pip install torch==2.4.0 transformers==4.51.3 numpy==1.25.0 pillow==10.3.0 moviepy==1.0.3 +pip install flash-attn==2.7.0.post2 --no-build-isolation +``` + +Then, run the following code. + +```python +import torch +import requests +from PIL import Image +from transformers import AutoModelForCausalLM + +MODEL_PATH = "AIDC-AI/Ovis2.5-2B" + +# Thinking mode & budget +enable_thinking = True +enable_thinking_budget = True # Only effective if enable_thinking is True. + +# Total tokens for thinking + answer. Ensure: max_new_tokens > thinking_budget + 25 +max_new_tokens = 3072 +thinking_budget = 2048 + +model = AutoModelForCausalLM.from_pretrained( + MODEL_PATH, + torch_dtype=torch.bfloat16, + trust_remote_code=True +).cuda() + +messages = [{ + "role": "user", + "content": [ + {"type": "image", "image": Image.open(requests.get("https://cdn-uploads.huggingface.co/production/uploads/658a8a837959448ef5500ce5/TIlymOb86R6_Mez3bpmcB.png", stream=True).raw)}, + {"type": "text", "text": "Calculate the sum of the numbers in the middle box in figure (c)."}, + ], +}] + +input_ids, pixel_values, grid_thws = model.preprocess_inputs( + messages=messages, + add_generation_prompt=True, + enable_thinking=enable_thinking +) +input_ids = input_ids.cuda() +pixel_values = pixel_values.cuda() if pixel_values is not None else None +grid_thws = grid_thws.cuda() if grid_thws is not None else None + +outputs = model.generate( + inputs=input_ids, + pixel_values=pixel_values, + grid_thws=grid_thws, + enable_thinking=enable_thinking, + enable_thinking_budget=enable_thinking_budget, + max_new_tokens=max_new_tokens, + thinking_budget=thinking_budget, +) + +response = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True) +print(response) 
+``` + +The thinking and thinking budget logic can be applied in the same way for multi-image, video and pure text scenarios. + +**Note (answer extraction for CoT/Thinking):** +To make evaluation and usage easier, we recommend appending a fixed suffix to prompts when using chain-of-thought (CoT) or thinking mode. This ensures the model clearly outputs a final answer that can be extracted programmatically: + +``` +End your response with 'Final answer: '. +``` + +For example: + +``` +Calculate the sum of the numbers in the middle box in figure (c). +End your response with 'Final answer: '. +``` + + +**Tip:** The sections below include an optional streaming helper (compatible with two-phase thinking/budget runs) and extra inference modes: multi-image, video, and text-only. + +
+Optional: Streaming (Advanced) + +To support thinking budget, we modified the implementation of the Ovis `generate` method and the default `TextIteratorStreamer` is now incompatible. If you need to stream model output, be sure to use the helper class below. + +```python +# --- Budget-aware streamer helper --- +from transformers import TextIteratorStreamer + +class BudgetAwareTextStreamer(TextIteratorStreamer): + """A streamer compatible with Ovis two-phase generation. + + Call .manual_end() after generation to flush any remaining text. + """ + def manual_end(self): + if len(self.token_cache) > 0: + text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs) + printable_text = text[self.print_len:] + self.token_cache = [] + self.print_len = 0 + else: + printable_text = "" + self.next_tokens_are_prompt = True + self.on_finalized_text(printable_text, stream_end=True) + + # Disable base class's end hook; we'll finalize via manual_end() + def end(self): + pass +``` + +Example usage: + +```python +streamer = BudgetAwareTextStreamer( + model.text_tokenizer, + skip_prompt=True, + skip_special_tokens=True +) + +outputs = model.generate( + inputs=input_ids, + pixel_values=pixel_values, + grid_thws=grid_thws, + enable_thinking=enable_thinking, + enable_thinking_budget=enable_thinking_budget, + max_new_tokens=max_new_tokens, + thinking_budget=thinking_budget, + streamer=streamer +) + +``` + +
+ +
+Example: Multi-image +Demonstrates how to run inference with multiple images and a related question. + +```python +# Multi-image inference +multi_image_files = [ + "/path/to/image_1.jpg", + "/path/to/image_2.jpg", + "/path/to/image_3.jpg", +] + +content = [{"type": "image", "image": Image.open(p).convert("RGB")} for p in multi_image_files] +content.append({"type": "text", "text": "Describe the images."}) +messages = [{"role": "user", "content": content}] + +input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, max_pixels=896*896) +input_ids = input_ids.cuda() +pixel_values = pixel_values.cuda().to(model.dtype) if pixel_values is not None else None +grid_thws = grid_thws.cuda() if grid_thws is not None else None + +with torch.no_grad(): + outputs = model.generate(inputs=input_ids, pixel_values=pixel_values, grid_thws=grid_thws, + max_new_tokens=1024, do_sample=True, + eos_token_id=model.text_tokenizer.eos_token_id, + pad_token_id=model.text_tokenizer.pad_token_id) +print(model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +
+ +
+Example: Video +Demonstrates how to run inference on a video by sampling multiple frames and asking the model to describe the content. + +```python +# Video inference +from moviepy.editor import VideoFileClip # pip install moviepy==1.0.3 + +video_file = "/path/to/video_1.mp4" +num_frames = 8 + +with VideoFileClip(video_file) as clip: + total_frames = int(clip.fps * clip.duration) + indices = [int(i * total_frames / num_frames) for i in range(num_frames)] + frames = [Image.fromarray(clip.get_frame(t)) for t in (idx / clip.fps for idx in indices)] + +messages = [{"role": "user", "content": [ + {"type": "video", "video": frames}, + {"type": "text", "text": "Describe this video in detail."}, +]}] + +input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, max_pixels=896*896) +input_ids = input_ids.cuda() +pixel_values = pixel_values.cuda().to(model.dtype) if pixel_values is not None else None +grid_thws = grid_thws.cuda() if grid_thws is not None else None + +with torch.no_grad(): + outputs = model.generate(inputs=input_ids, pixel_values=pixel_values, grid_thws=grid_thws, + max_new_tokens=1024, do_sample=True, + eos_token_id=model.text_tokenizer.eos_token_id, + pad_token_id=model.text_tokenizer.pad_token_id) +print(model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +
+ +
+Example: Text-only +Demonstrates how to run inference using only text input without any images or videos. + +```python +# Text-only inference +messages = [{"role": "user", "content": "Hi, please introduce Yellow Mountain."}] + +input_ids, _, _ = model.preprocess_inputs(messages=messages, add_generation_prompt=True) +input_ids = input_ids.cuda() + +with torch.no_grad(): + outputs = model.generate(inputs=input_ids, max_new_tokens=1024, do_sample=True, + eos_token_id=model.text_tokenizer.eos_token_id, + pad_token_id=model.text_tokenizer.pad_token_id) +print(model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` + +
+ +To enable grounding, end your prompt with `Please provide the bounding box coordinates.` (for boxes) or `Please provide the point coordinates.` (for points). To target a specific object, wrap its description in `<ref>` tags, e.g.: + +```text +Find the <ref>red apple</ref> in the image. Please provide the bounding box coordinates. +``` + +Coordinates are normalized to `[0,1)` with the origin `(0,0)` at the top-left corner of the image. + +* Point: `<point>(x,y)</point>` +* Bounding box: `<box>(x1,y1),(x2,y2)</box>` where `(x1,y1)` is top-left, `(x2,y2)` is bottom-right. +* Multiple results can be listed in square brackets: `[<box>(...)</box>,<box>(...)</box> ]` + +Example: + +```text +The image features a serene scene with <ref>three birds</ref>[ + <box>(0.401,0.526),(0.430,0.557)</box>, + <box>(0.489,0.494),(0.516,0.526)</box>, + <box>(0.296,0.529),(0.324,0.576)</box> +] flying in formation against a clear blue sky. +``` + + + +## Model Zoo + +| Ovis MLLMs | ViT | LLM | Model Weights | Demo | +|:-----------|:-----------------------:|:---------------------:|:-------------------------------------------------------:|:--------------------------------------------------------:| +| Ovis2.5-2B | siglip2-so400m-patch16-512 | Qwen3-1.7B | [Huggingface](https://huggingface.co/AIDC-AI/Ovis2.5-2B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis2.5-2B) | +| Ovis2.5-9B | siglip2-so400m-patch16-512 | Qwen3-8B | [Huggingface](https://huggingface.co/AIDC-AI/Ovis2.5-9B) | [Space](https://huggingface.co/spaces/AIDC-AI/Ovis2.5-9B) | + +## Performance +We evaluate Ovis2.5 using [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), as employed in the OpenCompass multimodal and reasoning evaluation suite.
+ +![image/png](https://cdn-uploads.huggingface.co/production/uploads/637aebed7ce76c3b834cea37/zYtwH4Yw6q6591en_FVX-.png) + +![image/png](https://cdn-uploads.huggingface.co/production/uploads/637aebed7ce76c3b834cea37/zWbsInYCHZYEPlY75xrRd.png) + + +## Citation +If you find Ovis useful, please consider citing the paper +```bibtex +@article{lu2025ovis25technicalreport, + title={Ovis2.5 Technical Report}, + author={Shiyin Lu and Yang Li and Yu Xia and Yuwei Hu and Shanshan Zhao and Yanqing Ma and Zhichao Wei and Yinglun Li and Lunhao Duan and Jianshan Zhao and Yuxuan Han and Haijun Li and Wanying Chen and Junke Tang and Chengkun Hou and Zhixing Du and Tianli Zhou and Wenjie Zhang and Huping Ding and Jiahe Li and Wen Li and Gui Hu and Yiliang Gu and Siran Yang and Jiamang Wang and Hailong Sun and Yibo Wang and Hui Sun and Jinlong Huang and Yuping He and Shengze Shi and Weihong Zhang and Guodong Zheng and Junpeng Jiang and Sensen Gao and Yi-Feng Wu and Sijia Chen and Yuhui Chen and Qing-Guo Chen and Zhao Xu and Weihua Luo and Kaifu Zhang}, + year={2025}, + journal={arXiv:2508.11737} +} + +@article{lu2024ovis, + title={Ovis: Structural Embedding Alignment for Multimodal Large Language Model}, + author={Shiyin Lu and Yang Li and Qing-Guo Chen and Zhao Xu and Weihua Luo and Kaifu Zhang and Han-Jia Ye}, + year={2024}, + journal={arXiv:2405.20797} +} +``` + +## License +This project is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0.txt) (SPDX-License-Identifier: Apache-2.0). + +## Disclaimer +We used compliance-checking algorithms during the training process, to ensure the compliance of the trained model to the best of our ability. Due to the complexity of the data and the diversity of language model usage scenarios, we cannot guarantee that the model is completely free of copyright issues or improper content. 
If you believe anything infringes on your rights or generates improper content, please contact us, and we will promptly address the matter. diff --git a/added_tokens.json b/added_tokens.json new file mode 100644 index 0000000..b54f913 --- /dev/null +++ b/added_tokens.json @@ -0,0 +1,28 @@ +{ + "</think>": 151668, + "</tool_call>": 151658, + "</tool_response>": 151666, + "<think>": 151667, + "<tool_call>": 151657, + "<tool_response>": 151665, + "<|box_end|>": 151649, + "<|box_start|>": 151648, + "<|endoftext|>": 151643, + "<|file_sep|>": 151664, + "<|fim_middle|>": 151660, + "<|fim_pad|>": 151662, + "<|fim_prefix|>": 151659, + "<|fim_suffix|>": 151661, + "<|im_end|>": 151645, + "<|im_start|>": 151644, + "<|image_pad|>": 151655, + "<|object_ref_end|>": 151647, + "<|object_ref_start|>": 151646, + "<|quad_end|>": 151651, + "<|quad_start|>": 151650, + "<|repo_name|>": 151663, + "<|video_pad|>": 151656, + "<|vision_end|>": 151653, + "<|vision_pad|>": 151654, + "<|vision_start|>": 151652 +} diff --git a/chat_template.json b/chat_template.json new file mode 100644 index 0000000..622708c --- /dev/null +++ b/chat_template.json @@ -0,0 +1,3 @@ +{ + "chat_template": "{%- for message in messages %}{{- '<|im_start|>' + message.role + '\n'}}{%- if message.role == 'system' or message.role == 'user' %}{%- if message.content is string %}{{- message.content | replace('', '') | replace('')[-1].lstrip('\n') %}{{- content }}{%- else %}{{- raise_exception('Invalid role. 
Supported roles are system, user, assistant.')}}{%- endif %}{{- '<|im_end|>\n'}}{%- endfor %}{%- if add_generation_prompt %}{{- '<|im_start|>assistant\n' }}{%- if enable_thinking is defined and enable_thinking is false %}{{- '<think>\n\n</think>\n\n' }}{%- endif %}{%- endif %}" +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..3a02d6f --- /dev/null +++ b/config.json @@ -0,0 +1,73 @@ +{ + "architectures": [ + "Ovis2_5" + ], + "auto_map": { + "AutoConfig": "configuration_ovis2_5.Ovis2_5_Config", + "AutoModelForCausalLM": "modeling_ovis2_5.Ovis2_5" + }, + "conversation_formatter_class": "Qwen3ConversationFormatter", + "hidden_size": 2048, + "vocab_size": 151936, + "num_attention_heads": 32, + "max_position_embeddings": 40960, + "llm_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen/Qwen3-1.7B", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sliding_window": null, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 + }, + "model_type": "ovis2_5", + "torch_dtype": "bfloat16", + "transformers_version": "4.51.3", + "use_cache": true, + "visual_vocab_size": 65536, + "vit_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "google/siglip2-so400m-patch16-512", + "attention_dropout": 0.0, + "fullatt_block_indexes": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "hidden_stride": 2, + "image_size": 512, + 
"intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip2_navit", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 27, + "num_patches": -1, + "patch_size": 16, + "preserve_original_pe": true, + "temporal_patch_size": 1, + "torch_dtype": "float32", + "use_rope": true, + "window_size": 112 + } +} diff --git a/configuration.json b/configuration.json new file mode 100644 index 0000000..4aef15d --- /dev/null +++ b/configuration.json @@ -0,0 +1 @@ +{"framework": "pytorch", "task": "image-text-to-text", "allow_remote": true} \ No newline at end of file diff --git a/configuration_ovis2_5.py b/configuration_ovis2_5.py new file mode 100644 index 0000000..0a90e12 --- /dev/null +++ b/configuration_ovis2_5.py @@ -0,0 +1,96 @@ +from typing import Any, Optional, List, Union + +from transformers import Qwen3Config +from transformers.configuration_utils import PretrainedConfig + +__all__ = ["Siglip2NavitConfig", "Ovis2_5_Config"] + + +class Siglip2NavitConfig(PretrainedConfig): + """This is the configuration class to store the configuration of an [`AIMv2Model`]. + + Instantiating a configuration with the defaults will yield a similar configuration + to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224). + + Args: + hidden_size: Dimension of the hidden representations. + intermediate_size: Dimension of the SwiGLU representations. + num_hidden_layers: Number of hidden layers in the Transformer. + num_attention_heads: Number of attention heads for each attention layer + in the Transformer. + num_channels: Number of input channels. + image_size: Image size. + patch_size: Patch size. + rms_norm_eps: Epsilon value used for the RMS normalization layer. + attention_dropout: Dropout ratio for attention probabilities. + projection_dropout: Dropout ratio for the projection layer after the attention. + qkv_bias: Whether to add a bias to the queries, keys and values. 
+ use_bias: Whether to add a bias in the feed-forward and projection layers. + kwargs: Keyword arguments for the [`PretrainedConfig`]. + """ + + model_type: str = "siglip2_navit" + + def __init__( + self, + hidden_size: int = 1024, + intermediate_size: int = 4096, + num_hidden_layers: int = 24, + num_attention_heads: int = 16, + num_channels: int = 3, + num_patches: int = -1, + image_size: int = 512, + patch_size: int = 16, + hidden_act: str="gelu_pytorch_tanh", + layer_norm_eps: float = 1e-6, + attention_dropout: float = 0.0, + hidden_stride: int = 2, + window_size: int = 112, + fullatt_block_indexes: Optional[list] = None, + temporal_patch_size: int = 1, + preserve_original_pe: bool = True, + use_rope: bool = True, + **kwargs: Any, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_size = patch_size + self.image_size = image_size + self.hidden_act = hidden_act + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_stride = hidden_stride + self.window_size = window_size + self.fullatt_block_indexes = fullatt_block_indexes + self.temporal_patch_size = temporal_patch_size + self.preserve_original_pe = preserve_original_pe + self.use_rope = use_rope + +class Ovis2_5_Config(PretrainedConfig): + model_type = "ovis2_5" + sub_configs = dict(llm_config=Qwen3Config, vit_config=Siglip2NavitConfig) + + def __init__(self, + llm_config: Optional[Union[Qwen3Config, dict]] = None, + vit_config: Optional[Union[Siglip2NavitConfig, dict]] = None, + visual_vocab_size=65536, + hidden_size=None, + **kwargs + ): + super().__init__(**kwargs) + if isinstance(llm_config, dict): + llm_config = Qwen3Config(**llm_config) + self.llm_config = llm_config + if isinstance(vit_config, dict): + vit_config = 
Siglip2NavitConfig(**vit_config) + self.vit_config = vit_config + self.visual_vocab_size = visual_vocab_size + self.hidden_size = hidden_size + if kwargs.get('attn_implementation'): + self.llm_config._attn_implementation = kwargs['attn_implementation'] + self.vit_config._attn_implementation = kwargs['attn_implementation'] diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..d44b45b --- /dev/null +++ b/generation_config.json @@ -0,0 +1,15 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "multimodal_max_length": 8192, + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.6, + "top_k": 20, + "top_p": 0.95, + "transformers_version": "4.51.3" +} diff --git a/merges.txt b/merges.txt new file mode 100644 index 0000000..80c1a19 --- /dev/null +++ b/merges.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8831e4f1a044471340f7c0a83d7bd71306a5b867e95fd870f74d0c5308a904d5 +size 1671853 diff --git a/model-00001-of-00002.safetensors b/model-00001-of-00002.safetensors new file mode 100644 index 0000000..8d023b3 --- /dev/null +++ b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bd7cec79fedabcd8149c62934fdffff7d840de111e8a17335d346bcbeba0c12 +size 4872524976 diff --git a/model-00002-of-00002.safetensors b/model-00002-of-00002.safetensors new file mode 100644 index 0000000..65a0919 --- /dev/null +++ b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bb2d37fcbf03254938b72d4bf88da9f84c8c0307b4f7d6ee79aa0a3c2f7732d +size 268435576 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..0c14295 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,758 @@ +{ + "metadata": { + "total_size": 5140859344 + }, + "weight_map": { + "llm.model.embed_tokens.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.mlp.gate_proj.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.k_proj.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + 
"llm.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "llm.model.norm.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.head.0.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.head.1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.head.1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.v_proj.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.layer_norm1.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.mlp.fc1.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.mlp.fc2.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.out_proj.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.q_proj.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.layer_norm1.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.layer_norm2.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.k_proj.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.out_proj.weight": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + 
"visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.v_proj.bias": 
"model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors", + "visual_tokenizer.vit.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors", + "vte.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/modeling_ovis2_5.py b/modeling_ovis2_5.py new file mode 100644 index 0000000..97284cb --- /dev/null +++ b/modeling_ovis2_5.py @@ -0,0 +1,1004 @@ +import math +from typing import Dict, List, Optional, Tuple, Union + +import PIL.Image +import numpy as np +import torch +from torch import Tensor, nn +from torch.nn import functional as F +from transformers import ( + AutoConfig, + AutoImageProcessor, + AutoModel, + AutoModelForCausalLM, + AutoTokenizer, +) +from transformers.activations import ACT2FN +from transformers.generation.utils import GenerateOutput +from transformers.modeling_outputs import BaseModelOutputWithNoAttention +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import is_flash_attn_2_available + +from .configuration_ovis2_5 import Siglip2NavitConfig, Ovis2_5_Config + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_varlen_func + from flash_attn.layers.rotary import apply_rotary_emb + + +IMAGE_PLACEHOLDER = "" +IMAGE_PLACEHOLDER_ID = -200 +VIDEO_PLACEHOLDER = "