初始化项目,由ModelHub XC社区提供模型

Model: prithivMLmods/Gliese-OCR-7B-Post1.0
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-06-15 07:16:12 +08:00
commit 71401d722d
21 changed files with 2234 additions and 0 deletions

55
.gitattributes vendored Normal file
View File

@@ -0,0 +1,55 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
vocab.json filter=lfs diff=lfs merge=lfs -text
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
merges.txt filter=lfs diff=lfs merge=lfs -text
model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text

View File

@@ -0,0 +1,401 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "DgpubXociwNK"
},
"source": [
"## **Gliese-OCR-7B-Post1.0(4-bit)**"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Nb3wNhothvX7"
},
"source": [
"The Gliese-OCR-7B-Post1.0 model is a fine-tuned version of Camel-Doc-OCR-062825, optimized for Document Retrieval, Content Extraction, and Analysis Recognition. Built on top of the Qwen2.5-VL architecture, this model enhances document comprehension capabilities with focused training on the Opendoc2-Analysis-Recognition dataset for superior document analysis and information extraction tasks.\n",
"\n",
" > This model shows significant improvements in LaTeX rendering and Markdown rendering for OCR tasks.\n",
"\n",
"| Image1 | Image2 |\n",
"|--------|--------|\n",
"| ![Screenshot 2025-08-30 at 12-50-11 Gradio.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/sZj3Gx32ICpm2lAVhmY_y.png) | ![Screenshot 2025-08-30 at 12-49-41 (anonymous) - output_426f8ad8-53ee-4609-9d55-6629ac37b055.pdf.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/ywaoJWmkDgjbJXVR_hsZO.png) |\n",
"\n",
"*multimodal model & notebook by: [prithivMLmods](https://huggingface.co/prithivMLmods)*"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Mk560Wx0j6PY"
},
"source": [
"### **Install packages**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qTD_dNliNS5T"
},
"outputs": [],
"source": [
"%%capture\n",
"!pip install git+https://github.com/huggingface/transformers.git \\\n",
" git+https://github.com/huggingface/accelerate.git \\\n",
" git+https://github.com/huggingface/peft.git \\\n",
" transformers-stream-generator huggingface_hub albumentations \\\n",
" pyvips-binary qwen-vl-utils sentencepiece opencv-python docling-core \\\n",
" python-docx torchvision safetensors matplotlib num2words \\\n",
"\n",
"!pip install xformers requests pymupdf hf_xet spaces pyvips pillow gradio \\\n",
" einops torch fpdf timm av decord bitsandbytes reportlab\n",
"#Hold tight, this will take around 1-2 minutes."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uiBblyf-kLmf"
},
"source": [
"### **Run Demo App**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pgz93DfvNMfb"
},
"outputs": [],
"source": [
"import spaces\n",
"import json\n",
"import math\n",
"import os\n",
"import traceback\n",
"from io import BytesIO\n",
"from typing import Any, Dict, List, Optional, Tuple\n",
"import re\n",
"import time\n",
"from threading import Thread\n",
"from io import BytesIO\n",
"import uuid\n",
"import tempfile\n",
"\n",
"import gradio as gr\n",
"import requests\n",
"import torch\n",
"from PIL import Image\n",
"import fitz\n",
"import numpy as np\n",
"\n",
"# --- New Model Imports ---\n",
"from transformers import (\n",
" Qwen2_5_VLForConditionalGeneration,\n",
" AutoProcessor,\n",
" TextIteratorStreamer,\n",
" BitsAndBytesConfig,\n",
")\n",
"\n",
"from reportlab.lib.pagesizes import A4\n",
"from reportlab.lib.styles import getSampleStyleSheet\n",
"from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
"from reportlab.lib.units import inch\n",
"\n",
"# --- Constants and Model Setup ---\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"print(\"CUDA_VISIBLE_DEVICES=\", os.environ.get(\"CUDA_VISIBLE_DEVICES\"))\n",
"print(\"torch.__version__ =\", torch.__version__)\n",
"print(\"torch.version.cuda =\", torch.version.cuda)\n",
"print(\"cuda available:\", torch.cuda.is_available())\n",
"print(\"cuda device count:\", torch.cuda.device_count())\n",
"if torch.cuda.is_available():\n",
" print(\"current device:\", torch.cuda.current_device())\n",
" print(\"device name:\", torch.cuda.get_device_name(torch.cuda.current_device()))\n",
"\n",
"print(\"Using device:\", device)\n",
"\n",
"\n",
"# --- Model Loading (Updated for Qwen2.5-VL) ---\n",
"\n",
"# Define model options\n",
"MODEL_OPTIONS = {\n",
" \"Gliese-OCR-7B-Post1.0\": \"prithivMLmods/Gliese-OCR-7B-Post1.0\",\n",
"}\n",
"\n",
"# Define 4-bit quantization configuration\n",
"# This config will load the model in 4-bit to save VRAM.\n",
"quantization_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_compute_dtype=torch.float16,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_use_double_quant=True,\n",
")\n",
"\n",
"# Preload models and processors into CUDA\n",
"models = {}\n",
"processors = {}\n",
"for name, model_id in MODEL_OPTIONS.items():\n",
" print(f\"Loading {name}🤗. This will use 4-bit quantization to save VRAM.\")\n",
" models[name] = Qwen2_5_VLForConditionalGeneration.from_pretrained(\n",
" model_id,\n",
" trust_remote_code=True,\n",
" quantization_config=quantization_config,\n",
" device_map=\"auto\"\n",
" )\n",
" processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n",
"print(\"Model loaded successfully.\")\n",
"\n",
"\n",
"# --- PDF Generation and Preview Utility Function (Unchanged) ---\n",
"def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):\n",
"    \"\"\"\n",
"    Generates a PDF from the source image and the extracted text, then renders\n",
"    every PDF page to a PNG file for the preview gallery.\n",
"\n",
"    Args:\n",
"        image: Source image, placed at the top of the PDF.\n",
"        text_content: Model output; Markdown heading markers and asterisks are stripped.\n",
"        font_size: Body font size in points (converted with int(), so a numeric string works).\n",
"        line_spacing: Multiplier applied to the font size to get the line leading.\n",
"        alignment: One of \"Left\", \"Center\", \"Right\", \"Justified\".\n",
"        image_size: One of \"Small\", \"Medium\", \"Large\" (30%, 60%, 90% of the text width).\n",
"\n",
"    Returns:\n",
"        A tuple (pdf_path, preview_paths). preview_paths holds one PNG path per\n",
"        rendered page and may be empty or partial if preview rendering fails.\n",
"\n",
"    Raises:\n",
"        gr.Error: If the image or the text content is missing.\n",
"    \"\"\"\n",
"    # Local import: only this function needs it.\n",
"    from xml.sax.saxutils import escape\n",
"\n",
"    if image is None or not text_content or not text_content.strip():\n",
"        raise gr.Error(\"Cannot generate PDF. Image or text content is missing.\")\n",
"\n",
"    # --- 1. Generate the PDF ---\n",
"    temp_dir = tempfile.gettempdir()\n",
"    pdf_filename = os.path.join(temp_dir, f\"output_{uuid.uuid4()}.pdf\")\n",
"    doc = SimpleDocTemplate(\n",
"        pdf_filename,\n",
"        pagesize=A4,\n",
"        rightMargin=inch, leftMargin=inch,\n",
"        topMargin=inch, bottomMargin=inch\n",
"    )\n",
"    styles = getSampleStyleSheet()\n",
"    style_normal = styles[\"Normal\"]\n",
"    style_normal.fontSize = int(font_size)\n",
"    style_normal.leading = int(font_size) * line_spacing\n",
"    style_normal.alignment = {\"Left\": 0, \"Center\": 1, \"Right\": 2, \"Justified\": 4}[alignment]\n",
"\n",
"    story = []\n",
"\n",
"    img_buffer = BytesIO()\n",
"    image.save(img_buffer, format='PNG')\n",
"    img_buffer.seek(0)\n",
"\n",
"    page_width, page_height = A4\n",
"    available_width = page_width - 2 * inch\n",
"    # Keep some slack below the margins so the image always fits inside the page frame.\n",
"    available_height = page_height - 2 * inch - 24\n",
"    image_widths = {\n",
"        \"Small\": available_width * 0.3,\n",
"        \"Medium\": available_width * 0.6,\n",
"        \"Large\": available_width * 0.9,\n",
"    }\n",
"    img_width = image_widths[image_size]\n",
"    img_height = image.height * (img_width / image.width)\n",
"    if img_height > available_height:\n",
"        # A tall image would not fit on the page and would make doc.build() fail;\n",
"        # shrink it while keeping the aspect ratio.\n",
"        img_width *= available_height / img_height\n",
"        img_height = available_height\n",
"    img = RLImage(img_buffer, width=img_width, height=img_height)\n",
"    story.append(img)\n",
"    story.append(Spacer(1, 12))\n",
"\n",
"    # Clean the text for PDF generation\n",
"    cleaned_text = re.sub(r'#+\\s*', '', text_content).replace(\"*\", \"\")\n",
"\n",
"    for para in cleaned_text.split('\\n'):\n",
"        if para.strip():\n",
"            # Paragraph parses its text as XML-like markup, so escape &, < and >\n",
"            # to keep arbitrary OCR output from breaking the PDF build.\n",
"            story.append(Paragraph(escape(para), style_normal))\n",
"\n",
"    doc.build(story)\n",
"\n",
"    # --- 2. Render PDF pages as images for preview ---\n",
"    preview_images = []\n",
"    try:\n",
"        # The context manager closes the document even if a page fails to render.\n",
"        with fitz.open(pdf_filename) as pdf_doc:\n",
"            for page_num in range(len(pdf_doc)):\n",
"                page = pdf_doc.load_page(page_num)\n",
"                pix = page.get_pixmap(dpi=150)\n",
"                preview_img_path = os.path.join(temp_dir, f\"preview_{uuid.uuid4()}_p{page_num}.png\")\n",
"                pix.save(preview_img_path)\n",
"                preview_images.append(preview_img_path)\n",
"    except Exception as e:\n",
"        # The preview is best-effort: the PDF itself is still returned.\n",
"        print(f\"Error generating PDF preview: {e}\")\n",
"\n",
"    return pdf_filename, preview_images\n",
"\n",
"\n",
"# --- Core Application Logic (Updated for Qwen2.5-VL with Streaming) ---\n",
"@spaces.GPU\n",
"def process_document(\n",
"    image: Image.Image,\n",
"    prompt_input: str,\n",
"    max_new_tokens: int,\n",
"    temperature: float,\n",
"    top_p: float,\n",
"    top_k: int,\n",
"    repetition_penalty: float\n",
"):\n",
"    \"\"\"\n",
"    Runs streaming inference with the Qwen2.5-VL model on one image and prompt.\n",
"\n",
"    This function is a generator. Each step yields the text generated so far\n",
"    twice: once for the raw textbox and once for the Markdown preview. If the\n",
"    image or the prompt is missing it yields a single hint message and stops.\n",
"    \"\"\"\n",
"    if image is None:\n",
"        yield \"Please upload an image.\", \"Please upload an image.\"\n",
"        return\n",
"    if not prompt_input or not prompt_input.strip():\n",
"        yield \"Please enter a prompt.\", \"Please enter a prompt.\"\n",
"        return\n",
"\n",
"    model_name = \"Gliese-OCR-7B-Post1.0\"\n",
"    model = models[model_name]\n",
"    processor = processors[model_name]\n",
"\n",
"    messages = [\n",
"        {\n",
"            \"role\": \"user\",\n",
"            \"content\": [\n",
"                {\"type\": \"image\", \"image\": image},\n",
"                {\"type\": \"text\", \"text\": prompt_input},\n",
"            ],\n",
"        }\n",
"    ]\n",
"\n",
"    text = processor.apply_chat_template(\n",
"        messages, tokenize=False, add_generation_prompt=True\n",
"    )\n",
"    # Send the inputs to wherever device_map=\"auto\" placed the model instead of\n",
"    # assuming the device is literally named \"cuda\".\n",
"    inputs = processor(\n",
"        text=[text],\n",
"        images=[image],\n",
"        padding=True,\n",
"        return_tensors=\"pt\",\n",
"    ).to(model.device)\n",
"\n",
"    streamer = TextIteratorStreamer(\n",
"        processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
"    )\n",
"\n",
"    generation_kwargs = dict(\n",
"        inputs,\n",
"        streamer=streamer,\n",
"        max_new_tokens=max_new_tokens,\n",
"        temperature=temperature,\n",
"        top_p=top_p,\n",
"        top_k=top_k,\n",
"        repetition_penalty=repetition_penalty,\n",
"        do_sample=temperature > 0,\n",
"    )\n",
"\n",
"    # generate() blocks, so it runs in a worker thread while this generator\n",
"    # drains the streamer.\n",
"    thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
"    thread.start()\n",
"\n",
"    buffer = \"\"\n",
"    for new_text in streamer:\n",
"        buffer += new_text\n",
"        # Remove special tokens from the output stream\n",
"        clean_buffer = buffer.replace(\"<|im_end|>\", \"\").replace(\"<|endoftext|>\", \"\")\n",
"        yield clean_buffer, clean_buffer\n",
"\n",
"    # The streamer is exhausted once generation has finished; reap the worker thread.\n",
"    thread.join()\n",
"\n",
"\n",
"# --- Gradio UI Definition (Updated Title, otherwise unchanged) ---\n",
"def create_gradio_interface():\n",
"    \"\"\"\n",
"    Builds and returns the Gradio web interface.\n",
"\n",
"    The left column holds the prompt, the image upload, the generation settings\n",
"    and the PDF export settings. The right column holds three tabs: raw model\n",
"    output, Markdown preview and PDF preview. Three click handlers are wired:\n",
"    streaming inference, PDF generation and clear-all.\n",
"\n",
"    Returns:\n",
"        The gr.Blocks app, not yet launched.\n",
"    \"\"\"\n",
"    css = \"\"\"\n",
"    .main-container { max-width: 1400px; margin: 0 auto; }\n",
"    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}\n",
"    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }\n",
"    #gallery { min-height: 400px; }\n",
"    \"\"\"\n",
"    with gr.Blocks(theme=\"bethecloud/storj_theme\", css=css) as demo:\n",
"        gr.HTML(\"\"\"\n",
"        <div class=\"title\" style=\"text-align: center\">\n",
"            <h1>Gliese-OCR-7B-Post1.0 📄</h1>\n",
"            <p style=\"font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;\">\n",
"                Image Content Extraction and Markdown Rendering\n",
"            </p>\n",
"        </div>\n",
"        \"\"\")\n",
"\n",
"        with gr.Row():\n",
"            # Left Column (Inputs)\n",
"            with gr.Column(scale=1):\n",
"                prompt_input = gr.Textbox(label=\"Query Input\", placeholder=\"✦︎ Enter the prompt.\", value=\"Precisely OCR the Image.\")\n",
"                image_input = gr.Image(label=\"Upload Image\", type=\"pil\", sources=['upload'])\n",
"\n",
"                with gr.Accordion(\"Advanced Settings\", open=False):\n",
"                    max_new_tokens = gr.Slider(minimum=64, maximum=2048, value=1024, step=32, label=\"Max New Tokens\")\n",
"                    temperature = gr.Slider(label=\"Temperature\", minimum=0.1, maximum=2.0, step=0.1, value=0.7)\n",
"                    top_p = gr.Slider(label=\"Top-p (nucleus sampling)\", minimum=0.05, maximum=1.0, step=0.05, value=0.9)\n",
"                    top_k = gr.Slider(label=\"Top-k\", minimum=1, maximum=100, step=1, value=50)\n",
"                    repetition_penalty = gr.Slider(label=\"Repetition penalty\", minimum=1.0, maximum=2.0, step=0.05, value=1.1)\n",
"\n",
"                with gr.Accordion(\"PDF Export Settings\", open=False):\n",
"                    font_size = gr.Dropdown(choices=[\"8\", \"10\", \"12\", \"14\", \"16\", \"18\"], value=\"12\", label=\"Font Size\")\n",
"                    line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label=\"Line Spacing\")\n",
"                    alignment = gr.Dropdown(choices=[\"Left\", \"Center\", \"Right\", \"Justified\"], value=\"Justified\", label=\"Text Alignment\")\n",
"                    image_size = gr.Dropdown(choices=[\"Small\", \"Medium\", \"Large\"], value=\"Medium\", label=\"Image Size in PDF\")\n",
"\n",
"                process_btn = gr.Button(\"🚀 Process Image\", variant=\"primary\", elem_classes=[\"process-button\"], size=\"lg\")\n",
"                clear_btn = gr.Button(\"🗑️ Clear All\", variant=\"secondary\")\n",
"\n",
"            # Right Column (Outputs)\n",
"            with gr.Column(scale=2):\n",
"                with gr.Tabs():\n",
"                    with gr.Tab(\"📝 Extracted Content\"):\n",
"                        # The hint is a placeholder, not a value, so it can never be\n",
"                        # exported to the PDF as if it were extracted text.\n",
"                        raw_output = gr.Textbox(label=\"Model Output\", placeholder=\"Model output will appear here.\", interactive=False, lines=15, show_copy_button=True)\n",
"\n",
"                        gr.Markdown(\"[prithivMLmods🤗](https://huggingface.co/prithivMLmods)\")\n",
"\n",
"                    with gr.Tab(\"📰 Markdown Preview\"):\n",
"                        with gr.Accordion(\"(Result.md)\", open=True):\n",
"                            markdown_output = gr.Markdown()\n",
"\n",
"                    with gr.Tab(\"📋 PDF Preview\"):\n",
"                        generate_pdf_btn = gr.Button(\"📄 Generate PDF & Render\", variant=\"primary\")\n",
"                        pdf_output_file = gr.File(label=\"Download Generated PDF\", interactive=False)\n",
"                        pdf_preview_gallery = gr.Gallery(label=\"PDF Page Preview\", show_label=True, elem_id=\"gallery\", columns=2, object_fit=\"contain\", height=\"auto\")\n",
"\n",
"        # Event Handlers\n",
"        def clear_all_outputs():\n",
"            \"\"\"Resets the image, prompt, both text outputs, the PDF file and the gallery.\"\"\"\n",
"            return None, \"\", \"\", \"\", None, None\n",
"\n",
"        # The .click() event will now stream the output from the generator function\n",
"        process_btn.click(\n",
"            fn=process_document,\n",
"            inputs=[image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],\n",
"            outputs=[raw_output, markdown_output]\n",
"        )\n",
"\n",
"        generate_pdf_btn.click(\n",
"            fn=generate_and_preview_pdf,\n",
"            inputs=[image_input, raw_output, font_size, line_spacing, alignment, image_size],\n",
"            outputs=[pdf_output_file, pdf_preview_gallery]\n",
"        )\n",
"\n",
"        clear_btn.click(\n",
"            clear_all_outputs,\n",
"            outputs=[image_input, prompt_input, raw_output, markdown_output, pdf_output_file, pdf_preview_gallery]\n",
"        )\n",
"    return demo\n",
"\n",
"if __name__ == \"__main__\":\n",
"    # Build the UI, then serve it through a bounded request queue: queue() is\n",
"    # what lets the generator handler stream partial output to the browser.\n",
"    demo = create_gradio_interface()\n",
"    demo.queue(max_size=20).launch(share=True, show_error=True)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because one or more lines are too long

120
README.md Normal file
View File

@@ -0,0 +1,120 @@
---
license: apache-2.0
pipeline_tag: image-text-to-text
language:
- en
- zh
base_model:
- prithivMLmods/Camel-Doc-OCR-062825
library_name: transformers
tags:
- Document
- VLM
- OCR
- VL
- Camel
- Openpdf
- text-generation-inference
- Extraction
- Linking
- Markdown
- Document Digitization
- Intelligent Document Processing (IDP)
- Intelligent Word Recognition (IWR)
- Optical Mark Recognition (OMR)
---
![1.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/GNsuO5cpxz73RW7xlrYCU.png)
# **Gliese-OCR-7B-Post1.0**
> The **Gliese-OCR-7B-Post1.0** model is a fine-tuned version of **[Camel-Doc-OCR-062825](https://huggingface.co/prithivMLmods/Camel-Doc-OCR-062825)**, optimized for **Document Retrieval**, **Content Extraction**, and **Analysis Recognition**. Built on top of the Qwen2.5-VL architecture, this model enhances document comprehension capabilities with focused training on the Opendoc2-Analysis-Recognition dataset for superior document analysis and information extraction tasks.
> [!note]
This model shows significant improvements in [LaTeX rendering and Markdown rendering for OCR tasks](https://huggingface.co/prithivMLmods/Gliese-OCR-7B-Post1.0/blob/main/Gliese-OCR-7B-Post1.0(4-bit)-reportlab/Gliese_OCR_7B_Post1_0(4_bit)_reportlab.ipynb).
# Key Enhancements
* **Context-Aware Multimodal Extraction and Linking for Documents**: Advanced capability for understanding document context and establishing connections between multimodal elements within documents.
* **Enhanced Document Retrieval**: Designed to efficiently locate and extract relevant information from complex document structures and layouts.
* **Superior Content Extraction**: Optimized for precise extraction of structured and unstructured content from diverse document formats.
* **Analysis Recognition**: Specialized in recognizing and interpreting analytical content, charts, tables, and visual data representations.
* **State-of-the-Art Performance Across Resolutions**: Achieves competitive results on OCR and visual QA benchmarks such as DocVQA, MathVista, RealWorldQA, and MTVQA.
* **Video Understanding up to 20+ minutes**: Supports detailed comprehension of long-duration videos for content summarization, Q\&A, and multi-modal reasoning.
* **Visually-Grounded Device Interaction**: Enables mobile/robotic device operation via visual inputs and text-based instructions using contextual understanding and decision-making logic.
# Quick Start with Transformers
```python
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"prithivMLmods/Gliese-OCR-7B-Post1.0", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("prithivMLmods/Gliese-OCR-7B-Post1.0")
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
},
{"type": "text", "text": "Describe this image."},
],
}
]
text = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```
# Intended Use
This model is intended for:
* Context-aware multimodal extraction and linking for complex document structures.
* High-fidelity document retrieval and content extraction from various document formats.
* Analysis recognition of charts, graphs, tables, and visual data representations.
* Document-based question answering for educational and enterprise applications.
* Extraction and LaTeX formatting of mathematical expressions from printed or handwritten content.
* Retrieval and summarization from long documents, slides, and multi-modal inputs.
* Multilingual document analysis and structured content extraction for global use cases.
* Robotic or mobile automation with vision-guided contextual interaction.
# Limitations
* May show degraded performance on extremely low-quality or occluded images.
* Not optimized for real-time applications on low-resource or edge devices due to computational demands.
* Variable accuracy on uncommon or low-resource languages/scripts.
* Long video processing may require substantial memory and is not optimized for streaming applications.
* Visual token settings affect performance; suboptimal configurations can impact results.
* In rare cases, outputs may contain hallucinated or contextually misaligned information.

24
added_tokens.json Normal file
View File

@@ -0,0 +1,24 @@
{
"</tool_call>": 151658,
"<tool_call>": 151657,
"<|box_end|>": 151649,
"<|box_start|>": 151648,
"<|endoftext|>": 151643,
"<|file_sep|>": 151664,
"<|fim_middle|>": 151660,
"<|fim_pad|>": 151662,
"<|fim_prefix|>": 151659,
"<|fim_suffix|>": 151661,
"<|im_end|>": 151645,
"<|im_start|>": 151644,
"<|image_pad|>": 151655,
"<|object_ref_end|>": 151647,
"<|object_ref_start|>": 151646,
"<|quad_end|>": 151651,
"<|quad_start|>": 151650,
"<|repo_name|>": 151663,
"<|video_pad|>": 151656,
"<|vision_end|>": 151653,
"<|vision_pad|>": 151654,
"<|vision_start|>": 151652
}

7
chat_template.jinja Normal file
View File

@@ -0,0 +1,7 @@
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endif %}<|im_start|>{{ message['role'] }}
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
{% endif %}

136
config.json Normal file
View File

@@ -0,0 +1,136 @@
{
"architectures": [
"Qwen2_5_VLForConditionalGeneration"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 3584,
"image_token_id": 151655,
"initializer_range": 0.02,
"intermediate_size": 18944,
"max_position_embeddings": 128000,
"max_window_layers": 28,
"model_type": "qwen2_5_vl",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"pad_token_id": 151643,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_section": [
16,
24,
24
],
"rope_type": "default",
"type": "default"
},
"rope_theta": 1000000.0,
"sliding_window": 32768,
"text_config": {
"architectures": [
"Qwen2_5_VLForConditionalGeneration"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"hidden_act": "silu",
"hidden_size": 3584,
"image_token_id": null,
"initializer_range": 0.02,
"intermediate_size": 18944,
"layer_types": [
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention",
"full_attention"
],
"max_position_embeddings": 128000,
"max_window_layers": 28,
"model_type": "qwen2_5_vl_text",
"num_attention_heads": 28,
"num_hidden_layers": 28,
"num_key_value_heads": 4,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_section": [
16,
24,
24
],
"rope_type": "default",
"type": "default"
},
"rope_theta": 1000000.0,
"sliding_window": null,
"torch_dtype": "bfloat16",
"use_cache": true,
"use_sliding_window": false,
"video_token_id": null,
"vision_end_token_id": 151653,
"vision_start_token_id": 151652,
"vision_token_id": 151654,
"vocab_size": 152064
},
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.57.0.dev0",
"use_cache": true,
"use_sliding_window": false,
"video_token_id": 151656,
"vision_config": {
"depth": 32,
"fullatt_block_indexes": [
7,
15,
23,
31
],
"hidden_act": "silu",
"hidden_size": 1280,
"in_channels": 3,
"in_chans": 3,
"initializer_range": 0.02,
"intermediate_size": 3420,
"model_type": "qwen2_5_vl",
"num_heads": 16,
"out_hidden_size": 3584,
"patch_size": 14,
"spatial_merge_size": 2,
"spatial_patch_size": 14,
"temporal_patch_size": 2,
"tokens_per_second": 2,
"torch_dtype": "bfloat16",
"window_size": 112
},
"vision_end_token_id": 151653,
"vision_start_token_id": 151652,
"vision_token_id": 151654,
"vocab_size": 152064
}

1
configuration.json Normal file
View File

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "others", "allow_remote": true}

14
generation_config.json Normal file
View File

@@ -0,0 +1,14 @@
{
"attn_implementation": "flash_attention_2",
"bos_token_id": 151643,
"do_sample": true,
"eos_token_id": [
151645,
151643
],
"max_length": 128000,
"pad_token_id": 151643,
"repetition_penalty": 1.05,
"temperature": 1e-06,
"transformers_version": "4.54.0"
}

3
merges.txt Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b8e04af9c950e393054010dcef499c3f005e803e9b2da1a09b96a26c33266eb1
size 1823241

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac9baf0d8f1bbd4c830305ee7b0ad0aabc627faf285264ad37dc3ac5328f4e82
size 4968243304

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af86c26d0b592d3aa7fc4796062f131e4a99f307b21bff488e3248b391f36943
size 4991495816

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dee364fa951b2413688dca0fc0ce6ad8e2c65ccbda983f7da2fcb986f51ed78b
size 4932751040

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:864159ea1c100e793516b3b7ed806fe4c20ccd551956a97039bd5a0dfc070490
size 1691924384

View File

@@ -0,0 +1,736 @@
{
"metadata": {
"total_size": 16584333312
},
"weight_map": {
"lm_head.weight": "model-00004-of-00004.safetensors",
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
"model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
"model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
"model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
"model.norm.weight": "model-00004-of-00004.safetensors",
"visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
"visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
"visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
"visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
"visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
"visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
"visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
"visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
"visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
"visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
}
}

37
preprocessor_config.json Normal file
View File

@@ -0,0 +1,37 @@
{
"crop_size": null,
"data_format": "channels_first",
"default_to_square": true,
"device": null,
"disable_grouping": null,
"do_center_crop": null,
"do_convert_rgb": true,
"do_normalize": true,
"do_rescale": true,
"do_resize": true,
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_processor_type": "Qwen2VLImageProcessorFast",
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"input_data_format": null,
"max_pixels": 12845056,
"merge_size": 2,
"min_pixels": 3136,
"patch_size": 14,
"processor_class": "Qwen2_5_VLProcessor",
"resample": 3,
"rescale_factor": 0.00392156862745098,
"return_tensors": null,
"size": {
"longest_edge": 12845056,
"shortest_edge": 3136
},
"temporal_patch_size": 2
}

31
special_tokens_map.json Normal file
View File

@@ -0,0 +1,31 @@
{
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e04081d680d5bb294b2e57aea5b3aa1256d9e06263e907917fc241c5adc2fbe4
size 11422163

209
tokenizer_config.json Normal file
View File

@@ -0,0 +1,209 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"clean_up_tokenization_spaces": false,
"eos_token": "<|im_end|>",
"errors": "replace",
"extra_special_tokens": {},
"model_max_length": 131072,
"pad_token": "<|endoftext|>",
"padding_side": "right",
"processor_class": "Qwen2_5_VLProcessor",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

View File

@@ -0,0 +1,43 @@
{
"crop_size": null,
"data_format": "channels_first",
"default_to_square": true,
"device": null,
"do_center_crop": null,
"do_convert_rgb": true,
"do_normalize": true,
"do_pad": null,
"do_rescale": true,
"do_resize": true,
"do_sample_frames": false,
"fps": null,
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"input_data_format": null,
"max_frames": 768,
"max_pixels": 12845056,
"merge_size": 2,
"min_frames": 4,
"min_pixels": 3136,
"num_frames": null,
"patch_size": 14,
"processor_class": "Qwen2_5_VLProcessor",
"resample": 3,
"rescale_factor": 0.00392156862745098,
"size": {
"longest_edge": 12845056,
"shortest_edge": 3136
},
"size_divisor": null,
"temporal_patch_size": 2,
"video_metadata": null,
"video_processor_type": "Qwen2VLVideoProcessor"
}

BIN
vocab.json (Stored with Git LFS) Normal file

Binary file not shown.