初始化项目，由ModelHub XC社区提供模型

Model: LLM-Research/Phi-3-vision-128k-instruct Source: Original Platform
2026-05-29 15:21:12 +08:00
commit d1f03baef2
22 changed files with 98309 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,9 @@
 # Microsoft Open Source Code of Conduct
 This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 Resources:
 - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--- a/LICENSE
+++ b/LICENSE
@@ -0,0 +1,21 @@
    MIT License
    Copyright (c) Microsoft Corporation.
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE
--- a/README.md
+++ b/README.md
@@ -0,0 +1,234 @@
 ---
 license: mit
 license_link: https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/LICENSE
 language:
 - multilingual
 pipeline_tag: text-generation
 tags:
 - nlp
 - code
 - vision
 inference:
  parameters:
    temperature: 0.7
 widget:
  - messages:
      - role: user
        content: <|image_1|>Can you describe what you see in the image?
 ---
 🎉 **Phi-3.5**: [[mini-instruct]](https://huggingface.co/microsoft/Phi-3.5-mini-instruct); [[MoE-instruct]](https://huggingface.co/microsoft/Phi-3.5-MoE-instruct) ; [[vision-instruct]](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)
 ## Model Summary
 The Phi-3-Vision-128K-Instruct is a lightweight, state-of-the-art open multimodal model built upon datasets that include synthetic data and filtered publicly available websites, with a focus on very high-quality, reasoning-dense data for both text and vision. The model belongs to the Phi-3 model family, and the multimodal version supports a context length of 128K tokens. The model underwent a rigorous enhancement process, incorporating both supervised fine-tuning and direct preference optimization to ensure precise instruction adherence and robust safety measures.
 Resources and Technical Documentation:
 + [Phi-3 Microsoft Blog](https://aka.ms/Phi-3Build2024)
 + [Phi-3 Technical Report](https://aka.ms/phi3-tech-report)
 + [Phi-3 on Azure AI Studio](https://aka.ms/try-phi3vision)
 + [Phi-3 Cookbook](https://github.com/microsoft/Phi-3CookBook)
 |         | Short Context | Long Context |
 | ------- | ------------- | ------------ |
 | Mini    | 4K [[HF]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) ; [[GGUF]](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx)|
 | Small   | 8K [[HF]](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-small-8k-instruct-onnx-cuda) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-small-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-small-128k-instruct-onnx-cuda)|
 | Medium  | 4K [[HF]](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-cuda) | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-cuda)|
 | Vision  |  | 128K [[HF]](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) ; [[ONNX]](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct-onnx-cuda)|
 ## Intended Uses
 **Primary use cases**
 The model is intended for broad commercial and research use in English. The model provides uses for general purpose AI systems and applications with visual and text input capabilities which require 
 1) memory/compute constrained environments;
 2) latency bound scenarios;
 3) general image understanding;
 4) OCR;
 5) chart and table understanding.
 Our model is designed to accelerate research on efficient language and multimodal models, for use as a building block for generative AI powered features.
 **Use case considerations**
 Our models are not specifically designed or evaluated for all downstream purposes. Developers should consider common limitations of language models as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness before using within a specific downstream use case, particularly for high-risk scenarios. 
 Developers should be aware of and adhere to applicable laws or regulations (including privacy, trade compliance laws, etc.) that are relevant to their use case. 
 Nothing contained in this Model Card should be interpreted as or deemed a restriction or modification to the license the model is released under.
 ## How to Use
 Phi-3-Vision-128K-Instruct has been integrated in the development version (4.40.2) of `transformers`. Until the official version is released through `pip`, ensure that you are doing one of the following:
 * When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
 * Update your local `transformers` to the development version: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`. The previous command is an alternative to cloning and installing from the source.
 The current `transformers` version can be verified with: `pip list | grep transformers`.
 Examples of required packages:
 ```
 flash_attn==2.5.8
 numpy==1.24.4
 Pillow==10.3.0
 Requests==2.31.0
 torch==2.3.0
 torchvision==0.18.0
 transformers==4.40.2
 ```
 Phi-3-Vision-128K-Instruct is also available in [Azure AI Studio](https://aka.ms/phi3-azure-ai).
 ### Chat Format
 Given the nature of the training data, the Phi-3-Vision-128K-Instruct model is best suited for a single image input with prompts using the chat format as follows.
 You can provide the prompt as a single image with a generic template as follows:
 ```markdown
 <|user|>\n<|image_1|>\n{prompt}<|end|>\n<|assistant|>\n 
 ```
 where the model generates the text after `<|assistant|>` . In case of multi-turn conversation, the prompt can be formatted as follows:
 ```markdown
 <|user|>\n<|image_1|>\n{prompt_1}<|end|>\n<|assistant|>\n{response_1}<|end|>\n<|user|>\n{prompt_2}<|end|>\n<|assistant|>\n 
 ```
 ### Sample inference code
 This code snippet shows how to quickly get started with running the model on a GPU:
 ```python
 from PIL import Image 
 import requests 
 from transformers import AutoModelForCausalLM 
 from transformers import AutoProcessor 
 model_id = "microsoft/Phi-3-vision-128k-instruct" 
 model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto", _attn_implementation='flash_attention_2') # use _attn_implementation='eager' to disable flash attention
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) 
 messages = [ 
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"}, 
    {"role": "assistant", "content": "The chart displays the percentage of respondents who agree with various statements about their preparedness for meetings. It shows five categories: 'Having clear and pre-defined goals for meetings', 'Knowing where to find the information I need for a meeting', 'Understanding my exact role and responsibilities when I'm invited', 'Having tools to manage admin tasks like note-taking or summarization', and 'Having more focus time to sufficiently prepare for meetings'. Each category has an associated bar indicating the level of agreement, measured on a scale from 0% to 100%."}, 
    {"role": "user", "content": "Provide insightful questions to spark discussion."} 
 ] 
 url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png" 
 image = Image.open(requests.get(url, stream=True).raw) 
 prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0") 
 generation_args = { 
    "max_new_tokens": 500, 
    "temperature": 0.0, 
    "do_sample": False, 
 } 
 generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) 
 # remove input tokens 
 generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
 response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] 
 print(response) 
 ```
 Additional basic examples are provided [here](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/sample_inference.py).
 ### How to finetune?
 We recommend that users take a look at the [Phi-3 CookBook finetuning recipe for Vision](https://github.com/microsoft/Phi-3CookBook/blob/main/md/04.Fine-tuning/FineTuning_Vision.md).
 ## Responsible AI Considerations
 Like other models, the Phi family of models can potentially behave in ways that are unfair, unreliable, or offensive. Some of the limiting behaviors to be aware of include:   
 + Quality of Service: The Phi models are trained primarily on English text. Languages other than English will experience worse performance. English language varieties with less representation in the training data might experience worse performance than standard American English.    
 + Representation of Harms & Perpetuation of Stereotypes: These models can over- or under-represent groups of people, erase representation of some groups, or reinforce demeaning or negative stereotypes. Despite safety post-training, these limitations may still be present due to differing levels of representation of different groups or prevalence of examples of negative stereotypes in training data that reflect real-world patterns and societal biases.  
 + Inappropriate or Offensive Content: These models may produce other types of inappropriate or offensive content, which may make it inappropriate to deploy for sensitive contexts without additional mitigations that are specific to the use case.  
 + Information Reliability: Language models can generate nonsensical content or fabricate content that might sound reasonable but is inaccurate or outdated.   
 + Limited Scope for Code: The majority of Phi-3 training data is based in Python and uses common packages such as "typing, math, random, collections, datetime, itertools". If the model generates Python scripts that utilize other packages or scripts in other languages, we strongly recommend users manually verify all API uses.
 Developers should apply responsible AI best practices and are responsible for ensuring that a specific use case complies with relevant laws and regulations (e.g. privacy, trade, etc.). Important areas for consideration include:  
 + Allocation: Models may not be suitable for scenarios that could have consequential impact on legal status or the allocation of resources or life opportunities (ex: housing, employment, credit, etc.) without further assessments and additional debiasing techniques.
 + High-Risk Scenarios: Developers should assess suitability of using models in high-risk scenarios where unfair, unreliable or offensive outputs might be extremely costly or lead to harm. This includes providing advice in sensitive or expert domains where accuracy and reliability are critical (ex: legal or health advice). Additional safeguards should be implemented at the application level according to the deployment context.
 + Misinformation: Models may produce inaccurate information. Developers should follow transparency best practices and inform end-users they are interacting with an AI system. At the application level, developers can build feedback mechanisms and pipelines to ground responses in use-case specific, contextual information, a technique known as Retrieval Augmented Generation (RAG).
 + Generation of Harmful Content: Developers should assess outputs for their context and use available safety classifiers or custom solutions appropriate for their use case.
 + Misuse: Other forms of misuse such as fraud, spam, or malware production may be possible, and developers should ensure that their applications do not violate applicable laws and regulations.
 + Identification of individuals: Models with vision capabilities may have the potential to uniquely identify individuals in images. Safety post-training steers the model to refuse such requests, but developers should consider and implement, as appropriate, additional mitigations or user consent flows as required in their respective jurisdiction (e.g., building measures to blur faces in image inputs before processing).
 ## Training
 ### Model
 * Architecture: Phi-3-Vision-128K-Instruct has 4.2B parameters and contains image encoder, connector, projector, and Phi-3 Mini language model.
 * Inputs: Text and Image. It’s best suited for prompts using the chat format. 
 * Context length: 128K tokens
 * GPUs: 512 H100-80G
 * Training time: 1.5 days
 * Training data: 500B vision and text tokens
 * Outputs: Generated text in response to the input
 * Dates: Our models were trained between February and April 2024
 * Status: This is a static model trained on an offline text dataset with cutoff date Mar 15, 2024. Future versions of the tuned models may be released as we improve models.
 * Release Type: Open weight release
 * Release dates: The model weights were released on May 21, 2024.
 ### Datasets
 Our training data includes a wide variety of sources, and is a combination of 
 1) publicly available documents filtered rigorously for quality, selected high-quality educational data and code;
 2) selected high-quality image-text interleave;
 3) newly created synthetic, “textbook-like” data for the purpose of teaching math, coding, common sense reasoning, general knowledge of the world (science, daily activities, theory of mind, etc.), newly created image data, e.g., chart/table/diagram/slides;
 4) high-quality chat-format supervised data covering various topics to reflect human preferences on different aspects such as instruction-following, truthfulness, honesty and helpfulness.
 The data collection process involved sourcing information from publicly available documents, with a meticulous approach to filtering out undesirable documents and images. To safeguard privacy, we carefully filtered various image and text data sources to remove or scrub any potentially personal data from the training data.
 More details can be found in the [Phi-3 Technical Report](https://aka.ms/phi3-tech-report).
 ## Benchmarks
 To understand the capabilities, we compare Phi-3-Vision-128K-Instruct with a set of models over a variety of zero-shot benchmarks using our internal benchmark platform.
 |Benchmark|Phi-3 Vision-128K-In|LlaVA-1.6 Vicuna-7B|QWEN-VL Chat|Llama3-Llava-Next-8B|Claude-3 Haiku|Gemini 1.0 Pro V|GPT-4V-Turbo|
 |---------|---------------------|------------------|------------|--------------------|--------------|----------------|------------|
 |MMMU|40.4|34.2|39.0|36.4|40.7|42.0|55.5| 
 |MMBench|80.5|76.3|75.8|79.4|62.4|80.0|86.1|
 |ScienceQA|90.8|70.6|67.2|73.7|72.0|79.7|75.7|
 |MathVista|44.5|31.5|29.4|34.8|33.2|35.0|47.5|
 |InterGPS|38.1|20.5|22.3|24.6|32.1|28.6|41.0|
 |AI2D|76.7|63.1|59.8|66.9|60.3|62.8|74.7|
 |ChartQA|81.4|55.0|50.9|65.8|59.3|58.0|62.3|
 |TextVQA|70.9|64.6|59.4|55.7|62.7|64.7|68.1|
 |POPE|85.8|87.2|82.6|87.0|74.4|84.2|83.7|
 ## Software
 * [PyTorch](https://github.com/pytorch/pytorch)
 * [Transformers](https://github.com/huggingface/transformers)
 * [Flash-Attention](https://github.com/HazyResearch/flash-attention)
 ## Hardware
 Note that by default, the Phi-3-Vision-128K model uses flash attention, which requires certain types of GPU hardware to run. We have tested on the following GPU types:
 * NVIDIA A100
 * NVIDIA A6000
 * NVIDIA H100
 ## License
 The model is licensed under the [MIT license](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/LICENSE).
 ## Trademarks
 This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft trademarks or logos is subject to and must follow [Microsoft’s Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party’s policies.
 ## Data Summary
 https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/data_summary_card.md
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -0,0 +1,41 @@
 <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
 ## Security
 Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
 If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
 ## Reporting Security Issues
 **Please do not report security vulnerabilities through public GitHub issues.**
 Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
 If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
 You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
 Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
  * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
  * Full paths of source file(s) related to the manifestation of the issue
  * The location of the affected source code (tag/branch/commit or direct URL)
  * Any special configuration required to reproduce the issue
  * Step-by-step instructions to reproduce the issue
  * Proof-of-concept or exploit code (if possible)
  * Impact of the issue, including how an attacker might exploit the issue
 This information will help us triage your report more quickly.
 If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
 ## Preferred Languages
 We prefer all communications to be in English.
 ## Policy
 Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
 <!-- END MICROSOFT SECURITY.MD BLOCK -->
--- a/SUPPORT.md
+++ b/SUPPORT.md
@@ -0,0 +1,25 @@
 # TODO: The maintainer of this repo has not yet edited this file
 **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 - **No CSS support:** Fill out this template with information about how to file issues and get help.
 - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
 # Support
 ## How to file issues and get help  
 This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
 issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
 feature request as a new Issue.
 For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
 FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
 CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
 ## Microsoft Support Policy  
 Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
--- a/config.json
+++ b/config.json
@@ -0,0 +1,148 @@
 {
  "_name_or_path": "Phi-3-vision-128k-instruct",
  "architectures": [
    "Phi3VForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3_v.Phi3VConfig",
    "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM"
  },
  "bos_token_id": 1,
  "embd_layer": {
    "embedding_cls": "image",
    "hd_transform_order": "sub_glb",
    "projection_cls": "mlp",
    "use_hd_transform": true,
    "with_learnable_separator": true
  },
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "img_processor": {
    "image_dim_out": 1024,
    "model_name": "openai/clip-vit-large-patch14-336",
    "name": "clip_vision_model",
    "num_img_tokens": 144
  },
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3_v",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "original_max_position_embeddings": 4096,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [
      1.0299999713897705,
      1.0499999523162842,
      1.0499999523162842,
      1.0799999237060547,
      1.2299998998641968,
      1.2299998998641968,
      1.2999999523162842,
      1.4499999284744263,
      1.5999999046325684,
      1.6499998569488525,
      1.8999998569488525,
      2.859999895095825,
      3.68999981880188,
      5.419999599456787,
      5.489999771118164,
      5.489999771118164,
      9.09000015258789,
      11.579999923706055,
      15.65999984741211,
      15.769999504089355,
      15.789999961853027,
      18.360000610351562,
      21.989999771118164,
      23.079999923706055,
      30.009998321533203,
      32.35000228881836,
      32.590003967285156,
      35.56000518798828,
      39.95000457763672,
      53.840003967285156,
      56.20000457763672,
      57.95000457763672,
      59.29000473022461,
      59.77000427246094,
      59.920005798339844,
      61.190006256103516,
      61.96000671386719,
      62.50000762939453,
      63.3700065612793,
      63.48000717163086,
      63.48000717163086,
      63.66000747680664,
      63.850006103515625,
      64.08000946044922,
      64.760009765625,
      64.80001068115234,
      64.81001281738281,
      64.81001281738281
    ],
    "short_factor": [
      1.05,
      1.05,
      1.05,
      1.1,
      1.1,
      1.1,
      1.2500000000000002,
      1.2500000000000002,
      1.4000000000000004,
      1.4500000000000004,
      1.5500000000000005,
      1.8500000000000008,
      1.9000000000000008,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.000000000000001,
      2.1000000000000005,
      2.1000000000000005,
      2.2,
      2.3499999999999996,
      2.3499999999999996,
      2.3499999999999996,
      2.3499999999999996,
      2.3999999999999995,
      2.3999999999999995,
      2.6499999999999986,
      2.6999999999999984,
      2.8999999999999977,
      2.9499999999999975,
      3.049999999999997,
      3.049999999999997,
      3.049999999999997
    ],
    "type": "su"
  },
  "rope_theta": 10000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.1",
  "use_cache": true,
  "vocab_size": 32064,
  "_attn_implementation": "flash_attention_2"
 }
--- a/configuration.json
+++ b/configuration.json
@@ -0,0 +1 @@
 {"framework": "pytorch", "task": "text-generation", "allow_remote": true}
--- a/configuration_phi3_v.py
+++ b/configuration_phi3_v.py
@@ -0,0 +1,217 @@
 # coding=utf-8
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Phi-3-V model configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 # Module-level logger obtained through the transformers logging utilities.
 logger = logging.get_logger(__name__)
 # Maps the hub model identifier to the URL of its hosted `config.json`.
 PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
 }
 class Phi3VConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Phi3VModel`]. It is used to instantiate a Phi-3
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the
    [microsoft/Phi-3-vision-128k-instruct](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct).
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 32064):
            Vocabulary size of the Phi-3-V model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Phi3VModel`].
        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            Dropout probability for mlp outputs.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        original_max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model was trained with. This is used to determine the size of the
            original RoPE embeddings when using long scaling.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon value used for the RMSNorm.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
            contain exactly the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either
            `su` or `yarn` and the `short_factor` and `long_factor` must be lists of numbers with the same length as
            the hidden size divided by the number of attention heads divided by 2.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 32000):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*, defaults to 32000):
            The id of the padding token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, no sliding window is applied.
        embd_layer (`str`, *optional*, defaults to `"default"`):
            The embedding layer to use. Can be either `"default"` or `"image"`. "default" uses the standard embedding for text.
    Raises:
        ValueError: If `rope_scaling` is not `None` and fails the checks in `_rope_scaling_validation`.
    Example:
    ```python
    >>> from transformers import Phi3VModel, Phi3VConfig
    >>> # Initializing a Phi-3-V style configuration
    >>> configuration = Phi3VConfig.from_pretrained("microsoft/Phi-3-vision-128k-instruct")
    >>> # Initializing a model from the configuration
    >>> model = Phi3VModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "phi3_v"
    keys_to_ignore_at_inference = ["past_key_values"]
    def __init__(
        self,
        vocab_size=32064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=4096,
        original_max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
        embd_layer: str = "default",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Fall back to one key/value head per attention head when unspecified.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        # Validation reads hidden_size / num_attention_heads, so it must run after they are set.
        self._rope_scaling_validation()
        self.sliding_window = sliding_window
        self.embd_layer = embd_layer
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        `None` is accepted as "no scaling". Otherwise `rope_scaling` must be a dict with exactly three
        entries whose `type` is `"su"` or `"yarn"` and whose `short_factor` / `long_factor` are lists of
        numbers of length `hidden_size // num_attention_heads // 2`.

        Raises:
            ValueError: If any of the above conditions is violated.
        """
        if self.rope_scaling is None:
            return
        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )
        rope_scaling_type = self.rope_scaling.get("type", None)
        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
        # Both factor lists obey the same rules; check short first, then long.
        for field_name in ("short_factor", "long_factor"):
            factor = self.rope_scaling.get(field_name, None)
            if not (isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)):
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must be a list of numbers, got {factor}"
                )
            expected_length = self.hidden_size // self.num_attention_heads // 2
            if len(factor) != expected_length:
                raise ValueError(
                    f"`rope_scaling`'s {field_name} field must have length {expected_length}, got {len(factor)}"
                )
--- a/data_summary_card.md
+++ b/data_summary_card.md
@@ -0,0 +1,149 @@
 # Data Summary for Phi-3-vision-128k-instruct, Phi-3.5-vision-instruct 
 ## 1. General information 
 **1.0.1 Version of the Summary:** 1.0 
 **1.0.2 Last update:** 10-Dec-2025 
 ## 1.1 Model Developer Identification 
 **1.1.1 Model Developer name and contact details:** Microsoft Corporation at One Microsoft Way, Redmond, WA 98052. Tel: 425-882-8080. 
 ## 1.2 Model Identification 
 **1.2.1 Versioned model name(s):** Phi-3-Vision-128K-Instruct, Phi-3.5-vision-instruct 
 **1.2.2 Model release date:** 21-May-2024  
 ## 1.3 Overall training data size and characteristics 
 ### 1.3.1 Size of dataset and characteristics 
 **1.3.1.A Text training data size:** 1 billion to 10 trillion tokens 
 **1.3.1.B Text training data content:** Our training data includes a wide variety of sources, and is a combination of publicly available documents selected for quality, selected educational data and code; selected interleaved image-text data; newly created synthetic, “textbook-like” data for the purpose of teaching math, coding, common sense reasoning, and general knowledge of the world (science, daily activities, theory of mind, etc.); and chat-format supervised data covering various topics to reflect preferences on different aspects such as instruction-following, truthfulness, honesty and helpfulness. 
 **1.3.1.C Image training data size:** 1 million to 1 billion images 
 **1.3.1.D Image training data content:** Selected image-text interleaved data and newly created image data including charts, tables, diagrams, and slides, filtered from publicly available sources for quality and safety 
 **1.3.1.E Audio training data size:** Not applicable 
 **1.3.1.F Audio training data content:** Not applicable 
 **1.3.1.G Video training data size:** Not applicable 
 **1.3.1.H Video training data content:** Not applicable 
 **1.3.1.I Other training data size:** Not applicable 
 **1.3.1.J Other training data content:** Not applicable 
 **1.3.2 Latest date of data acquisition/collection for model training:** 15-Mar-2024 
 **1.3.3 Is data collection ongoing to update the model with new data collection after deployment?** No 
 **1.3.4 Date the training dataset was first used to train the model:** 01-Feb-2024 
 **1.3.5 Rationale or purpose of data selection:** Datasets were selected to maximize reasoning-dense coverage across text and vision for general-purpose multimodal understanding, including math, coding, common sense reasoning, and chart/table/diagram interpretation, supporting efficient deployment in constrained environments 
 ## 2. List of data sources 
 ### 2.1 Publicly available datasets 
 **2.1.1 Have you used publicly available datasets to train the model?** Yes 
 ## 2.2 Private non-publicly available datasets obtained from third parties 
 ### 2.2.1 Datasets commercially licensed by rights holders or their representatives 
 **2.2.1.A Have you concluded transactional commercial licensing agreement(s) with rights holder(s) or with their representatives?** Not applicable 
 ### 2.2.2 Private datasets obtained from other third-parties 
 **2.2.2.A Have you obtained private datasets from third parties that are not licensed as described in Section 2.2.1, such as data obtained from providers of private databases, or data intermediaries?** No 
 ## 2.3 Personal Information 
 **2.3.1 Was personal data used to train the model?** Microsoft follows all relevant laws and regulations pertaining to personal information. 
 ## 2.4 Synthetic data 
 **2.4.1 Was any synthetic AI-generated data used to train the model?** Yes 
 ## 3. Data processing aspects 
 ### 3.1 Respect of reservation of rights from text and data mining exception or limitation 
 **3.1.1 Does this dataset include any data protected by copyright, trademark, or patent?** Microsoft follows all required regulations and laws for processing data protected by copyright, trademark, or patent. 
 ## 3.2 Other information 
 **3.2.1 Does the dataset include information about consumer groups without revealing individual consumer identities?** Microsoft follows all required regulations and laws for protecting consumer identities. 
 **3.2.2 Was the dataset cleaned or modified before model training?** Yes 
--- a/image_embedding_phi3_v.py
+++ b/image_embedding_phi3_v.py
@@ -0,0 +1,331 @@
 # coding=utf-8
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import warnings
 import torch
 from torch import nn
 from transformers import CLIPVisionConfig, CLIPVisionModel, PretrainedConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 from transformers.utils import logging
 try:
    from flash_attn import flash_attn_func
 except ImportError:
    pass
 # Module-level logger obtained through the transformers logging utilities.
 logger = logging.get_logger(__name__)
 # Bound used by Phi3ImageEmbedding.forward: negative input ids strictly greater than
 # -MAX_INPUT_ID are treated as image-token positions.
 MAX_INPUT_ID = int(1e9)
 # Hard-coded vision configuration used by Phi3ImageEmbedding to instantiate its
 # CLIPVisionModel (336x336 input, 14x14 patches, 24 layers, hidden size 1024).
 CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
  attention_dropout=0.0,
  dropout=0.0,
  hidden_act="quick_gelu",
  hidden_size=1024,
  image_size=336,
  initializer_factor=1.0,
  initializer_range=0.02,
  intermediate_size=4096,
  layer_norm_eps=1e-05,
  num_attention_heads=16,
  num_channels=3,
  num_hidden_layers=24,
  patch_size=14,
  projection_dim=768
 )
 class CLIPAttentionFA2(CLIPAttention):
    """CLIPAttention variant whose forward pass runs through flash attention 2.

    Only used inside the vision encoder. Masks and attention-weight outputs are
    not supported and are rejected by assertions.
    """
    def forward(self,
        hidden_states,
        attention_mask=None,
        causal_attention_mask=None,
        output_attentions=False,
    ):
        """Compute non-causal self-attention over `hidden_states` (Batch x Time x Channel).

        Returns a tuple `(attn_output, None)`; the second element is always `None`
        because attention weights are never produced.
        """
        assert attention_mask is None, "CLIPAttentionFA2 does not support attention_mask"
        assert causal_attention_mask is None, "CLIPAttentionFA2 does not support causal_attention_mask"
        assert output_attentions is False, "CLIPAttentionFA2 does not support output_attentions"
        batch, seq_len, channels = hidden_states.size()
        # Projections are laid out as (batch, seq, heads, head_dim) before the kernel call.
        per_head_shape = (batch, seq_len, self.num_heads, self.head_dim)
        q = self.q_proj(hidden_states).reshape(per_head_shape)
        k = self.k_proj(hidden_states).reshape(per_head_shape)
        v = self.v_proj(hidden_states).reshape(per_head_shape)
        # Dropout is only active in training mode.
        drop_prob = self.dropout if self.training else 0.0
        context = flash_attn_func(
            q,
            k,
            v,
            dropout_p=drop_prob,
            softmax_scale=self.scale,
            causal=False,
        )
        # Merge the heads back into the channel dimension and apply the output projection.
        merged = context.reshape(batch, seq_len, channels)
        return self.out_proj(merged), None
 class Phi3ImageEmbedding(nn.Module):
    """Phi3 Image embedding.

    Embeds `input_ids` with the supplied token-embedding module `wte` and, when the ids
    contain image placeholders (negative ids greater than `-MAX_INPUT_ID`), overwrites
    those positions with projected CLIP vision features of the given `pixel_values`.
    """
    def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
        """Build the vision encoder, optional separators and the feature projection.

        Args:
            config: Model configuration. Reads `n_embd` or `hidden_size`, optionally
                `embd_pdrop` / `embed_pdrop`, `img_processor` (must be a dict with
                `name == 'clip_vision_model'`), `_attn_implementation` and `vocab_size`.
            wte: Token-embedding module; it is called on `input_ids` in `forward`.
            **kwargs: Recognised keys are `use_hd_transform`, `with_learnable_separator`,
                `hd_transform_order` and `projection_cls` (`'linear'` or `'mlp'`).

        Raises:
            NotImplementedError: For an unsupported `img_processor` or `projection_cls`.
        """
        super().__init__()
        # n_embed or hidden_size
        hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
        # Embedding dropout is optional; either spelling of the attribute is accepted.
        if hasattr(config, 'embd_pdrop') or hasattr(config, 'embed_pdrop'):
            embd_drop = config.embd_pdrop if hasattr(config, 'embd_pdrop') else config.embed_pdrop
            self.drop = nn.Dropout(embd_drop)
        else:
            self.drop = None
        self.wte = wte
        if isinstance(config.img_processor, dict) and config.img_processor.get('name', None) == 'clip_vision_model':
            assert 'model_name' in config.img_processor, 'model_name must be provided for CLIPVisionModel'
            assert 'image_dim_out' in config.img_processor, 'image_dim_out must be provided for CLIPVisionModel'
            assert 'num_img_tokens' in config.img_processor, 'num_img_tokens must be provided for CLIPVisionModel'
            assert config.img_processor['model_name'] == 'openai/clip-vit-large-patch14-336'
            # The encoder is instantiated from the hard-coded module-level config, not from `model_name`.
            clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
            self.img_processor = CLIPVisionModel(clip_config)
            image_dim_out = config.img_processor['image_dim_out']
            self.num_img_tokens = config.img_processor['num_img_tokens']
            # FA2 in CLIP
            # Replace every encoder layer's attention module with the flash-attention variant.
            if config._attn_implementation == 'flash_attention_2':
                for layer in self.img_processor.vision_model.encoder.layers:
                    clip_fa2 = CLIPAttentionFA2(clip_config)
                    del layer.self_attn
                    layer.self_attn = clip_fa2
        else:
            raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
        self.image_dim_out = image_dim_out
        self.img_sizes = None
        # global_gn and sub_gn for hd transform, serves as line separator
        self.use_hd_transform = kwargs.get('use_hd_transform', False)
        self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
        # NOTE(review): the default here is 'glb_sub', but hd_feature_transform asserts
        # 'sub_glb'; callers apparently must pass hd_transform_order explicitly - confirm.
        self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
        # with_hd_transform and with_learnable_separator should have same value
        assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
        if self.with_learnable_separator:
            assert self.use_hd_transform, 'learnable separator is only for hd transform'
            # 1024 * 4, merge spatial to channel dimension
            self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
            self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
            logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')
        # Projection from vision features to the language model's hidden size.
        projection_cls = kwargs.get('projection_cls', 'linear')
        if projection_cls == 'linear':
            self.img_projection = nn.Linear(image_dim_out, hidden_size)
        elif projection_cls == 'mlp' and self.use_hd_transform:
            # With the HD transform the input width is 4x image_dim_out (2x2 patches merged into channels).
            dim_projection = hidden_size
            depth = 2
            layers = [nn.Linear(image_dim_out * 4, dim_projection)]
            for _ in range(1, depth):
                layers.extend([nn.GELU(),
                                nn.Linear(dim_projection, dim_projection)])
            self.img_projection = nn.Sequential(*layers)
        elif projection_cls == 'mlp':
            dim_projection = hidden_size
            depth = 2
            layers = [nn.Linear(image_dim_out, dim_projection)]
            for _ in range(1, depth):
                layers.extend([nn.GELU(),
                                nn.Linear(dim_projection, dim_projection)])
            self.img_projection = nn.Sequential(*layers)
        else:
            raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')
        self.vocab_size = config.vocab_size
        self.img_features = None
        # Which hidden state of the vision encoder to read, and which tokens of it to keep.
        if isinstance(config.img_processor, dict):
            self.layer_idx = config.img_processor.get('layer_idx', -2)
            self.type_feature = config.img_processor.get('type_feature', 'patch')
        else:
            self.layer_idx = -2
            self.type_feature = 'patch'
    def set_img_features(self, img_features: torch.FloatTensor) -> None:
        """Store `img_features` on the module (`self.img_features`)."""
        self.img_features = img_features
    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
        """Store `img_sizes` on the module (`self.img_sizes`)."""
        self.img_sizes = img_sizes
    def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
        """Run the vision encoder and return the selected hidden state without its first token.

        Only `type_feature == "patch"` is implemented; any other value raises
        `NotImplementedError`.
        """
        LAYER_IDX = self.layer_idx
        TYPE_FEATURE = self.type_feature
        img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
        img_feature = img_processor_output.hidden_states[LAYER_IDX]
        if TYPE_FEATURE == "patch":
            # Drop position 0 along the sequence axis (presumably the CLS token - confirm).
            patch_feature = img_feature[:, 1:]
            return patch_feature
        raise NotImplementedError
    def forward(
        self, input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes=None
    ) -> torch.FloatTensor:
        """Embed `input_ids`, substituting projected image features at image positions.

        Args:
            input_ids: Token ids; negative ids greater than `-MAX_INPUT_ID` mark image
                positions. The tensor is clamped IN PLACE, so the markers are lost
                after this call.
            pixel_values: Unpacked as `(num_images, num_crops, c, h, w)` with
                `c == 3` and `h == w == 336`; only read when image positions exist.
            image_sizes: Iterable of `(h, w)` pairs, forwarded to `hd_feature_transform`.

        Returns:
            The embedded sequence, with dropout applied when configured.
        """
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        # positions for image tokens
        positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=True)
        has_image = len(positions[0].tolist()) > 0
        # input_ids = input_ids.clamp_min(0).clamp_max(self.vocab_size).detach()
        # NOTE(review): the upper bound is vocab_size, not vocab_size - 1; confirm that
        # `wte` accepts that index.
        input_ids.clamp_min_(0).clamp_max_(self.vocab_size)
        warnings.warn(
            "Phi-3-V modifies `input_ids` in-place and the tokens indicating images will be "
            "removed after model forward. If your workflow requires multiple forward passes on "
            "the same `input_ids`, please make a copy of `input_ids` before passing it to the "
            "model."
        )
        hidden_states = self.wte(input_ids)
        if has_image:
            assert self.use_hd_transform
            num_images, num_crops, c, h, w = pixel_values.shape
            assert c == 3 and h == w == 336
            # Encode all crops of all images in one batch, then regroup per image.
            img_features = self.get_img_features(pixel_values.flatten(0, 1)).reshape(
                num_images, num_crops, -1, self.image_dim_out
            )
            image_features_proj = self.hd_feature_transform(img_features, image_sizes)
            # Overwrite the embeddings at the image positions with the projected features.
            hidden_states = hidden_states.index_put(
                positions, image_features_proj, accumulate=False
            )
        if self.drop is not None:
            hidden_states = self.drop(hidden_states)
        return hidden_states
    def hd_feature_transform(self, image_features, image_sizes):
        """
        Arrange per-crop features into the HD layout and project them.

        For each image the output concatenates, in order: the sub-image (crop) features
        with newline separators, the global separator `glb_GN`, and the global-image
        features with newline separators. Only `hd_transform_order == 'sub_glb'` is
        implemented.

        image_features: (num_images, num_crops+1, 24*24, 1024)
        image_sizes: iterable of (h, w) pairs; the crop grid is (h // 336, w // 336)
        """
        assert (
            self.hd_transform_order == 'sub_glb'
        ), f'hd_transform_order `{self.hd_transform_order}` not implemented'
        # Take device/dtype from the projection's first bias so the input can be moved to match.
        if isinstance(self.img_projection, nn.Sequential):
            target_device = self.img_projection[0].bias.device
            target_dtype = self.img_projection[0].bias.dtype
        else:  # It's a single nn.Linear layer
            target_device = self.img_projection.bias.device
            target_dtype = self.img_projection.bias.dtype
        global_image_features = image_features[:, 0]  # (num_images, 24*24, 1024)
        # global feature can be viewed as a special HD case with num_crops 1x1
        global_image_features_hd = self.reshape_hd_patches_2x2merge(global_image_features, 1, 1)
        global_image_features_hd_newline = self.add_image_newline(global_image_features_hd)
        all_image_embeddings = []
        # need a for loop to process each image because of different image sizes
        # (patch arrangement is different for each image)
        for i, img_size in enumerate(image_sizes):
            h, w = img_size
            h_crop = h // 336
            w_crop = w // 336
            num_crops = h_crop * w_crop
            # NOTE: real num_crops is padded
            # (num_crops, 24*24, 1024)
            # Index 0 is the global image, so the real crops start at index 1.
            sub_image_features = image_features[i, 1 : 1 + num_crops]
            sub_image_features_hd = self.reshape_hd_patches_2x2merge(
                sub_image_features, h_crop, w_crop
            )
            sub_image_features_hd_newline = self.add_image_newline(sub_image_features_hd)
            # [sub features, separator, global features]
            all_image_embeddings.extend(
                [
                    sub_image_features_hd_newline.squeeze(0),  # (h_crop*12*(w_crop*12+1), 4096)
                    self.glb_GN.squeeze(0),
                    global_image_features_hd_newline[i],
                ]
            )
        image_features_proj = self.img_projection(
            torch.cat(all_image_embeddings, dim=0).to(target_device).to(target_dtype)
        )
        return image_features_proj
    def reshape_hd_patches_2x2merge(self, image_features, h_crop, w_crop):
        """
        Merge each 2x2 block of patches into the channel dimension and tile the crops
        of every image into one spatial grid.

        image_features: (num_images*num_crops, 24*24, 1024)
        output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops
        """
        N, L, C = image_features.shape
        # The sequence length and channel width are hard-coded here (24*24 patches, 1024 channels).
        assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0
        num_images = N // (h_crop * w_crop)
        H = int(L**0.5)
        image_features_hd = (
            image_features.reshape(N, H, H, C)  # N, 24, 24, 1024
            .reshape(N, H // 2, 2, H // 2, 2, C)  # N, 12, 2, 12, 2, 1024
            .permute(0, 1, 3, 2, 4, 5)  # N, 12, 12, 2, 2, 1024
            .reshape(N, -1, 4 * C)  # N, 144, 4096
            .reshape(
                num_images, h_crop, w_crop, H // 2, H // 2, -1
            )  # n_img, h_crop, w_crop, 12, 12, 4096
            .permute(0, 1, 3, 2, 4, 5)  # n_img, h_crop, 12, w_crop, 12, 4096
            .reshape(
                num_images, h_crop * H // 2, w_crop * H // 2, 4 * C
            )  # n_img, h_crop*12, w_crop*12, 4096
        )
        # alternative implementation using einops
        # from einops import rearrange
        # image_features_nhwc = rearrange(
        #     image_features,
        #     'N (H W) c -> N H W c',
        #     H=H,
        #     W=H,
        # )
        # image_features_2x2merge = rearrange(
        #     image_features_nhwc,
        #     'N (h h_pool) (w w_pool) c -> N h w (h_pool w_pool c)',
        #     h_pool=2,
        #     w_pool=2,
        # )
        # image_features_hd = rearrange(
        #     image_features_2x2merge,
        #     '(n_img h_crop w_crop) h w C -> n_img (h_crop h) (w_crop w) C',
        #     h_crop=h_crop,
        #     w_crop=w_crop,
        # )
        return image_features_hd
    def add_image_newline(self, image_features_hd):
        """
        Append the learnable separator `sub_GN` at the end of every feature row and
        flatten the rows into one sequence per image.

        image_features_hd: (num_images, h_crop*12, w_crop*12, 4096)
        output: (num_images, (h_crop*12) * (w_crop*12+1), 4096)
        """
        num_images, h, w, hid_dim = image_features_hd.shape
        # add the newline token to the HD image feature patches
        newline_embeddings = self.sub_GN.expand(num_images, h, -1, -1)  # (n_img, h, 1, hid_dim)
        image_features_hd_newline = torch.cat(
            [image_features_hd, newline_embeddings], dim=2
        ).reshape(num_images, -1, hid_dim)
        return image_features_hd_newline
--- a/image_processing_phi3_v.py
+++ b/image_processing_phi3_v.py
@@ -0,0 +1,274 @@
 # coding=utf-8
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Image processor class for Phi3-V."""
 from typing import List, Optional, Union
 import numpy as np
 from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
 from transformers.image_transforms import (
    convert_to_rgb,
 )
 from transformers.image_utils import (
    OPENAI_CLIP_MEAN,
    OPENAI_CLIP_STD,
    ImageInput,
    make_list_of_images,
    valid_images,
 )
 from transformers.utils import TensorType, is_vision_available, logging
 from transformers import AutoImageProcessor
 logger = logging.get_logger(__name__)
 if is_vision_available():
    from PIL import Image
 import torch
 import torchvision
 def padding_336(b):
    """Pad an image vertically so that its height becomes a multiple of 336.

    Args:
        b: image exposing ``.size`` as ``(width, height)`` (a PIL image when
            called from ``HD_transform``).

    Returns:
        The image padded on top and bottom with the fill value
        ``[255, 255, 255]``; the width is left untouched. When the missing
        height is odd, the extra row goes to the bottom.
    """
    _, img_height = b.size
    target_height = int(np.ceil(img_height / 336) * 336)
    missing = target_height - img_height
    pad_top = int(missing / 2)
    pad_bottom = missing - pad_top
    # torchvision expects the padding as [left, top, right, bottom].
    return torchvision.transforms.functional.pad(
        b, [0, pad_top, 0, pad_bottom], fill=[255, 255, 255]
    )
 def calc_padded_size(width, height, padding_unit=336):
    """Compute the image size after vertical padding, without touching pixels.

    Mirrors the arithmetic of ``padding_336``: the height is rounded up to the
    next multiple of ``padding_unit`` while the width is left unchanged.

    Args:
        width: image width in pixels.
        height: image height in pixels.
        padding_unit: the height is padded up to a multiple of this value.

    Returns:
        ``(padded_width, padded_height)``.
    """
    rounded_height = int(np.ceil(height / padding_unit) * padding_unit)
    pad_top = int((rounded_height - height) / 2)
    pad_bottom = rounded_height - height - pad_top
    # No horizontal padding is ever applied.
    pad_left = pad_right = 0
    return width + pad_left + pad_right, height + pad_top + pad_bottom
 def HD_transform(img, hd_num=16):
    """Resize and pad an image so that both sides are multiples of 336.

    The image is processed in landscape orientation (portrait inputs are
    transposed first and transposed back at the end). The width is set to
    ``scale * 336`` where ``scale`` is the largest integer satisfying
    ``scale * ceil(scale / aspect_ratio) <= hd_num``; the height follows the
    aspect ratio and is then padded by ``padding_336``.

    Args:
        img: PIL image.
        hd_num: upper bound used in the scale search above.

    Returns:
        The resized and padded PIL image, in the input's original orientation.
    """
    width, height = img.size
    was_portrait = width < height
    if was_portrait:
        img = img.transpose(Image.TRANSPOSE)
        width, height = img.size
    aspect_ratio = (width / height)
    # Grow `scale` while the next candidate still satisfies the hd_num bound.
    scale = 0
    while (scale + 1) * np.ceil((scale + 1) / aspect_ratio) <= hd_num:
        scale += 1
    target_w = int(scale * 336)
    target_h = int(target_w / aspect_ratio)
    resized = torchvision.transforms.functional.resize(img, [target_h, target_w],)
    padded = padding_336(resized)
    if was_portrait:
        padded = padded.transpose(Image.TRANSPOSE)
    return padded
 def calc_hd_transform_size(width, height, hd_num=16):
    """Compute the output size of ``HD_transform`` from the input size alone.

    Follows the same steps as ``HD_transform`` (work in landscape orientation,
    pick the largest ``scale`` with ``scale * ceil(scale / ratio) <= hd_num``,
    resize, pad the height) but only on the dimensions.

    Args:
        width: input image width in pixels.
        height: input image height in pixels.
        hd_num: upper bound used in the scale search.

    Returns:
        ``(padded_width, padded_height)`` in the input's original orientation.
    """
    was_portrait = width < height
    if was_portrait:
        # Swap so that the computation below always sees width >= height.
        width, height = height, width
    aspect_ratio = width / height
    # Grow `scale` while the next candidate still satisfies the hd_num bound.
    scale = 0
    while (scale + 1) * np.ceil((scale + 1) / aspect_ratio) <= hd_num:
        scale += 1
    resized_w = int(scale * 336)
    resized_h = int(resized_w / aspect_ratio)
    out_w, out_h = calc_padded_size(resized_w, resized_h)
    if was_portrait:
        return out_h, out_w
    return out_w, out_h
 def pad_to_max_num_crops_tensor(images, max_crops=5):
    """
    Zero-pad a stack of crops along the first dimension up to ``max_crops``.

    Args:
        images: tensor of shape B x C x H x W, B<=max_crops.
        max_crops: number of crops the returned tensor should contain.

    Returns:
        A tensor of shape max_crops x C x H x W whose trailing entries are
        zeros. When B >= max_crops the input is returned unchanged.
    """
    B, C, H, W = images.shape
    if B < max_crops:
        # Take the channel count from the input instead of assuming 3, and let
        # new_zeros inherit the input's dtype and device.
        pad = images.new_zeros((max_crops - B, C, H, W))
        images = torch.cat([images, pad], dim=0)
    return images
 class Phi3VImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Phi3 image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
    for processing high resolution images as explained in the [InternLM-XComposer2-4KHD](https://arxiv.org/pdf/2404.06512)
    Args:
        num_crops (`int`, *optional*, defaults to 1):
            Passed as `hd_num` to `HD_transform` to bound the size of the resized image. In `preprocess`, every
            image's crop stack is zero-padded to `num_crops + 1` entries (the extra entry is the 336x336 global
            view).
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use when normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use when normalizing the image. This is a float or list of floats the length of
            the number of channels in the image. Can be overridden by the `image_std` parameter in the
            `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """
    model_input_names = ["pixel_values"]
    def __init__(
        self,
        num_crops: int = 1,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.num_crops = num_crops
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb
    @staticmethod
    def _count_image_tokens(height, width):
        """Number of image tokens for an HD-transformed image of the given size.
        Both sides are expected to be multiples of 336 (as produced by `HD_transform`).
        The count is `(h_crops * w_crops + 1) * 144 + 1 + (h_crops + 1) * 12`:
        144 tokens per 336x336 crop plus the global view, 12 per crop row plus the
        global view, and one extra token.
        NOTE(review): the 12-per-row and single extra tokens presumably correspond to
        the newline and separator embeddings added by the model -- confirm against the
        modeling code.
        """
        h_crops = height // 336
        w_crops = width // 336
        return int((h_crops * w_crops + 1) * 144 + 1 + (h_crops + 1) * 12)
    def calc_num_image_tokens(
            self,
            images: ImageInput
    ):
        """ Calculate the number of image tokens for each image.
        Args:
            images (`ImageInput`):
                A single image or a batch of images. Each image must support `.convert('RGB')`
                (i.e. PIL images), because it is run through `HD_transform`.
        Returns:
            `List[int]`: the number of image tokens for each input image.
        """
        images = make_list_of_images(images)
        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        images = [image.convert('RGB') for image in images]
        elems = [HD_transform(im, hd_num=self.num_crops) for im in images]
        # PIL reports size as (width, height); store it as (height, width).
        shapes = [[im.size[1], im.size[0]] for im in elems]
        return [self._count_image_tokens(h, w) for h, w in shapes]
    def calc_num_image_tokens_from_image_size(self, width, height):
        """
        Calculate the number of image tokens for a given image size.
        Args:
            width (`int`): Width of the image.
            height (`int`): Height of the image.
        Returns:
            `int`: the number of image tokens for an image of that size.
        """
        new_width, new_height = calc_hd_transform_size(width, height, hd_num=self.num_crops)
        return self._count_image_tokens(new_height, new_width)
    def preprocess(
        self,
        images: ImageInput,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ):
        """
        Turn images into padded stacks of normalized 336x336 crops (one global view followed by the local crops).
        Args:
            images (`ImageInput`):
                A single image or a batch of images. Each image must support `.convert('RGB')`
                (i.e. PIL images).
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Normalization is always applied.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Normalization is always applied.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB with `convert_to_rgb`. Note that images are converted with
                `.convert('RGB')` afterwards regardless of this flag.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
        Returns:
            `BatchFeature` with the keys:
                - `pixel_values`: crops of shape `(num_images, num_crops + 1, 3, 336, 336)`.
                - `image_sizes`: `[height, width]` of each HD-transformed image.
                - `num_img_tokens`: number of image tokens for each image.
        """
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        images = make_list_of_images(images)
        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        img_processor = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(image_mean, image_std)
        ])
        # PIL images
        # HD_transform pads images to a size that is a multiple of 336 on both sides.
        # NOTE(review): this conversion runs even when do_convert_rgb is False; kept as-is
        # because the 3-channel reshape below depends on it.
        images = [image.convert('RGB') for image in images]
        elems = [HD_transform(im, hd_num=self.num_crops) for im in images]
        # tensor transform and normalize
        hd_images = [img_processor(im) for im in elems]
        # create global image
        global_image = [torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype) for im in hd_images]
        # [(3, h, w)], where h, w is multiple of 336
        shapes = [[im.size(1), im.size(2)] for im in hd_images]
        num_img_tokens = [self._count_image_tokens(h, w) for h, w in shapes]
        # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336)
        # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336)
        hd_images_reshape = [im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336).contiguous() for im, (h, w) in zip(hd_images, shapes)]
        # concat global image and local image
        hd_images_reshape = [torch.cat([_global_image, _im], dim=0) for _global_image, _im in zip(global_image, hd_images_reshape)]
        # pad to max_num_crops
        image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops+1) for im in hd_images_reshape]
        image_transformed = torch.stack(image_transformed, dim=0)
        data = {"pixel_values": image_transformed,
                "image_sizes": shapes,
                "num_img_tokens": num_img_tokens
                }
        return BatchFeature(data=data, tensor_type=return_tensors)
 # Register the Phi3VImageProcessor class with AutoImageProcessor under the
 # "Phi3VImageProcessor" key at import time.
 AutoImageProcessor.register("Phi3VImageProcessor", Phi3VImageProcessor)
--- a/model-00001-of-00002.safetensors
+++ b/model-00001-of-00002.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:175b2fe918dd8bd2549e3441615ee0c6d7b1f6d638c0104a614546f55c273482
 size 4944122112
--- a/model-00002-of-00002.safetensors
+++ b/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:e61ece5a8f0c9663afa06cc22799056f5cc084fb993518bf036dc8e268fd4c94
 size 3349208776
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,599 @@
 {
  "metadata": {
    "total_size": 8293242880
  },
  "weight_map": {
    "lm_head.weight": "model-00002-of-00002.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
    "model.norm.weight": "model-00002-of-00002.safetensors",
    "model.vision_embed_tokens.glb_GN": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.embeddings.class_embedding": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_projection.0.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_projection.0.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_projection.2.bias": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.img_projection.2.weight": "model-00001-of-00002.safetensors",
    "model.vision_embed_tokens.sub_GN": "model-00001-of-00002.safetensors"
  }
 }
--- a/modeling_phi3_v.py
+++ b/modeling_phi3_v.py
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@@ -0,0 +1,20 @@
 {
    "auto_map": {
        "AutoProcessor": "processing_phi3_v.Phi3VProcessor",
        "AutoImageProcessor": "image_processing_phi3_v.Phi3VImageProcessor"
    },
    "num_crops": 16,
    "image_mean": [
      0.48145466,
      0.4578275,
      0.40821073
    ],
    "image_processor_type": "Phi3VImageProcessor",
    "image_std": [
      0.26862954,
      0.26130258,
      0.27577711
    ],
    "processor_class": "Phi3VProcessor",
    "num_img_tokens": 144
  }
--- a/processing_phi3_v.py
+++ b/processing_phi3_v.py
@@ -0,0 +1,217 @@
 # coding=utf-8
 # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Processor class for Phi3-V.
 """
 import re
 from typing import List, Optional, Union
 import torch
 import transformers
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
 from transformers.utils import TensorType
 from .image_processing_phi3_v import Phi3VImageProcessor 
 # Attach the custom image processor class to the `transformers` package namespace.
 # NOTE(review): presumably this lets ProcessorMixin resolve the string
 # `image_processor_class = "Phi3VImageProcessor"` by attribute lookup on
 # `transformers` — confirm against the installed transformers version.
 transformers.Phi3VImageProcessor = Phi3VImageProcessor 
 class Phi3VProcessor(ProcessorMixin):
    r"""
    Constructs a Phi3-V processor which wraps a Phi3-V image processor and a LLaMa tokenizer into a single processor.
    [`Phi3VProcessor`] offers all the functionalities of [`Phi3VImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~Phi3VProcessor.__call__`] and [`~Phi3VProcessor.decode`] for more information.
    Args:
        image_processor ([`Phi3VImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Phi3VImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    special_image_token = "<|image|>"
    def __init__(self, image_processor, tokenizer):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.num_img_tokens = image_processor.num_img_tokens
        # NOTE(review): this eagerly materialises one million tag strings on every
        # construction, and nothing in this class reads `self.img_tokens`. It is kept
        # as-is because it is a public attribute that external code may index.
        self.img_tokens = [f"<|image_{i+1}|>" for i in range(1000000)]
    def __call__(
        self,
        text: Union[TextInput, List[TextInput]],
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s).
        When `images` is `None`, `text` is forwarded to the tokenizer together with `padding`, `truncation`,
        `max_length` and `return_tensors`. When `images` is given, they are first run through the image processor
        and `text` is then split on its `<|image_N|>` tags so that placeholder ids can be inserted for every image.
        Please refer to the docstring of the tokenizer and of [`Phi3VImageProcessor`] for more information.
        Args:
            text (`str`, `List[str]`):
                The sequence or batch of sequences to be encoded. A batch (list) is only supported when `images`
                is `None`; when images are supplied `text` must be a single `str` containing `<|image_1|>` ...
                `<|image_N|>` tags.
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
                Only used when `images` is `None`.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
                Only used when `images` is `None`.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
                Only used when `images` is `None`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
                When `images` is given, `input_ids` and `attention_mask` are always PyTorch tensors.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- Token ids to be fed to a model. Image positions hold negative placeholder ids.
            - **attention_mask** -- Mask specifying which tokens should be attended to by the model.
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **image_sizes** -- Image sizes reported by the image processor. Returned when `images` is not `None`.
        Raises:
            TypeError: If `images` is given and `text` is not a single `str`.
        """
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
        else:
            image_inputs = {}
        inputs = self._convert_images_texts_to_inputs(image_inputs, text, padding=padding, truncation=truncation, max_length=max_length, return_tensors=return_tensors)
        return inputs
    def calc_num_image_tokens(self, images: ImageInput):
        """ Calculate the number of image tokens for each image.
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        """
        return self.image_processor.calc_num_image_tokens(images)
    def calc_num_image_tokens_from_image_size(self, width, height):
        """ Calculate the number of image token for an image with given width and height.
        Args:
            width (`int`):
                Width of the image.
            height (`int`):
                Height of the image.
        """
        return self.image_processor.calc_num_image_tokens_from_image_size(width, height)
    @property
    def special_image_token_id(self):
        """Tokenizer id of `special_image_token` (`"<|image|>"`)."""
        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)
    def get_special_image_token_id(self):
        """Method-style accessor returning the same value as `special_image_token_id`."""
        return self.special_image_token_id
    def _convert_images_texts_to_inputs(self, images, texts, padding=False, truncation=None, max_length=None, return_tensors=None):
        """
        Build the model inputs from the image processor output and the prompt text.
        Args:
            images: Mapping produced by the image processor (must provide `pixel_values`, `image_sizes` and either
                `num_img_tokens` or `num_crops`), or an empty mapping for text-only input.
            texts: Prompt text. May be anything the tokenizer accepts when `images` is empty; must be a single
                `str` otherwise.
            padding, truncation, max_length, return_tensors: Forwarded to the tokenizer in the text-only case
                only; they are not applied when images are present.
        Returns:
            [`BatchFeature`] with the tokenizer output (text-only), or with `input_ids`, `attention_mask`,
            `pixel_values` and `image_sizes` (with images).
        Raises:
            TypeError: If images are present and `texts` is not a `str`.
            AssertionError: If the image tags are not numbered 1..N without gaps, or their count does not match
                the number of images.
        """
        if not len(images):
            model_inputs = self.tokenizer(texts, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length)
            return BatchFeature(data={**model_inputs})
        # The tag-splitting below works on one prompt string only; fail with an
        # explicit message instead of the opaque TypeError raised inside re.split.
        if not isinstance(texts, str):
            raise TypeError(
                f"When images are provided, `text` must be a single str, got {type(texts).__name__}"
            )
        pattern = r"<\|image_\d+\|>"
        # N tags split the prompt into N+1 text chunks, each tokenized on its own.
        prompt_chunks = [self.tokenizer(chunk).input_ids for chunk in re.split(pattern, texts)]
        if 'num_img_tokens' in images:
            num_img_tokens = images['num_img_tokens']
        else:
            assert 'num_crops' in images, 'num_crops must be provided in images if num_img_tokens is not provided'
            num_crops = images['num_crops']
            num_img_tokens = [_num_crops * self.num_img_tokens for _num_crops in num_crops]
        images, image_sizes = images['pixel_values'], images['image_sizes']
        # image_tags needs to start from 1 to n
        image_tags = re.findall(pattern, texts)
        # "<|image_3|>" -> 3
        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
        unique_image_ids = sorted(list(set(image_ids)))
        # image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be [1, 4, 5]
        # check the condition
        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
        # total images must be the same as the number of image tags
        assert len(unique_image_ids) == len(images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(images)} images"
        # Each tag becomes a run of the negative image index, one entry per image token.
        image_ids_pad = [[-iid]*num_img_tokens[iid-1] for iid in image_ids]
        # Interleave chunk0, pad0, chunk1, pad1, ..., chunkN; the trailing empty
        # list pairs with the last text chunk, which has no image after it.
        input_ids = []
        for chunk_ids, pad_ids in zip(prompt_chunks, image_ids_pad + [[]]):
            input_ids.extend(chunk_ids)
            input_ids.extend(pad_ids)
        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
        # Image placeholders are small negative ids, so this threshold keeps them attended.
        attention_mask = (input_ids > -1000000).to(torch.long)
        return BatchFeature(data={"input_ids": input_ids,
                                  "attention_mask": attention_mask,
                                  "pixel_values": images,
                                  "image_sizes": image_sizes})
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)
    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
--- a/sample_inference.py
+++ b/sample_inference.py
@@ -0,0 +1,108 @@
 # Sample inference script for Phi-3-vision: two text-only prompts, a single-image
 # prompt, a multi-turn chat-template prompt, and a table-to-markdown prompt.
 from PIL import Image
 import requests
 import torch
 from transformers import AutoModelForCausalLM
 from transformers import AutoProcessor
 model_path = "./"
 processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()
 user_prompt = '<|user|>\n'
 assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 def load_image(url):
    """Download `url` and open the response stream as a PIL image."""
    return Image.open(requests.get(url, stream=True).raw)
 def generate_response(prompt, images=None, skip_special_tokens=True):
    """Run one generation round-trip and return the decoded completion.
    Args:
        prompt: Fully formatted prompt string.
        images: A single image, a list of images, or None for text-only prompts.
        skip_special_tokens: Forwarded to `processor.batch_decode`.
    Returns:
        The decoded text of the newly generated tokens only.
    """
    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(**inputs,
                                  max_new_tokens=1000,
                                  eos_token_id=processor.tokenizer.eos_token_id,
                                  )
    # Drop the echoed prompt tokens so that only the completion is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    return processor.batch_decode(generate_ids,
                                  skip_special_tokens=skip_special_tokens,
                                  clean_up_tokenization_spaces=False)[0]
 #################################################### text-only ####################################################
 prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
 print(f">>> Prompt\n{prompt}")
 response = generate_response(prompt)
 print(f'>>> Response\n{response}')
 #################################################### text-only 2 ####################################################
 prompt = f"{user_prompt}Give me the code for sloving two-sum problem.{prompt_suffix}{assistant_prompt}"
 print(f">>> Prompt\n{prompt}")
 response = generate_response(prompt)
 print(f'>>> Response\n{response}')
 #################################################### EXAMPLE 1 ####################################################
 # single-image prompt
 prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
 url = "https://www.ilankelman.org/stopsigns/australia.jpg"
 print(f">>> Prompt\n{prompt}")
 image = load_image(url)
 response = generate_response(prompt, image)
 print(f'>>> Response\n{response}')
 #################################################### EXAMPLE 2 ####################################################
 # chat template
 chat = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by."},
    {"role": "user", "content": "What is so special about this image"}
 ]
 url = "https://www.ilankelman.org/stopsigns/australia.jpg"
 image = load_image(url)
 prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
 # Remove a trailing <|endoftext|> if present: it is used for training, not inference.
 # For training, make sure to add <|endoftext|> at the end.
 eos_marker = "<|endoftext|>"
 if prompt.endswith(eos_marker):
    # Slice the suffix off. str.rstrip() would treat the argument as a set of
    # characters and could also eat a preceding "<|end|>".
    prompt = prompt[:-len(eos_marker)]
 print(f">>> Prompt\n{prompt}")
 response = generate_response(prompt, [image])
 print(f'>>> Response\n{response}')
 ############################# to markdown #############################
 # single-image prompt
 prompt = f"{user_prompt}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
 url = "https://support.content.office.net/en-us/media/3dd2b79b-9160-403d-9967-af893d17b580.png"
 image = load_image(url)
 print(f">>> Prompt\n{prompt}")
 # Special tokens are kept in this decode, as in the original example.
 response = generate_response(prompt, image, skip_special_tokens=False)
 print(f'>>> Response\n{response}')
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,36 @@
 {
  "additional_special_tokens": [
    "<|system|>",
    "<|end|>",
    "<|user|>",
    "<|end|>"
  ],
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,408 @@
 {
  "add_bos_token": true,
  "add_eos_token": false,
  "added_tokens_decoder": {
    "0": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "32000": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32001": {
      "content": "<|assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32002": {
      "content": "<|placeholder1|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32003": {
      "content": "<|placeholder2|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32004": {
      "content": "<|placeholder3|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32005": {
      "content": "<|placeholder4|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32006": {
      "content": "<|system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32007": {
      "content": "<|end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32008": {
      "content": "<|placeholder5|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32009": {
      "content": "<|placeholder6|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32010": {
      "content": "<|user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "32011": {
      "content": "<|placeholder7|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32012": {
      "content": "<|placeholder8|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32013": {
      "content": "<|placeholder9|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32014": {
      "content": "<|placeholder10|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32015": {
      "content": "<|placeholder11|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32016": {
      "content": "<|placeholder12|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32017": {
      "content": "<|placeholder13|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32018": {
      "content": "<|placeholder14|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32019": {
      "content": "<|placeholder15|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32020": {
      "content": "<|placeholder16|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32021": {
      "content": "<|placeholder17|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32022": {
      "content": "<|placeholder18|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32023": {
      "content": "<|placeholder19|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32024": {
      "content": "<|placeholder20|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32025": {
      "content": "<|placeholder21|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32026": {
      "content": "<|placeholder22|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32027": {
      "content": "<|placeholder23|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32028": {
      "content": "<|placeholder24|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32029": {
      "content": "<|placeholder25|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32030": {
      "content": "<|placeholder26|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32031": {
      "content": "<|placeholder27|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32032": {
      "content": "<|placeholder28|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32033": {
      "content": "<|placeholder29|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32034": {
      "content": "<|placeholder30|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32035": {
      "content": "<|placeholder31|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32036": {
      "content": "<|placeholder32|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32037": {
      "content": "<|placeholder33|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32038": {
      "content": "<|placeholder34|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32039": {
      "content": "<|placeholder35|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32040": {
      "content": "<|placeholder36|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32041": {
      "content": "<|placeholder37|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32042": {
      "content": "<|placeholder38|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32043": {
      "content": "<|placeholder39|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "32044": {
      "content": "<|image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|system|>",
    "<|end|>",
    "<|user|>",
    "<|end|>"
  ],
  "bos_token": "<s>",
  "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "legacy": false,
  "model_max_length": 131072,
  "pad_token": "<|endoftext|>",
  "padding_side": "right",
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": "<unk>",
  "use_default_system_prompt": false
 }
		`@@ -0,0 +1 @@`
							`{"framework": "pytorch", "task": "text-generation", "allow_remote": true}`