初始化项目，由ModelHub XC社区提供模型

Model: prithivMLmods/Hoags-2B-Exp Source: Original Platform
2026-05-21 01:12:12 +08:00
commit 544b40a98e
15 changed files with 152194 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,131 @@
 ---
 license: apache-2.0
 language:
 - en
 - zh
 base_model:
 - prithivMLmods/Qwen2-VL-OCR-2B-Instruct
 pipeline_tag: image-text-to-text
 library_name: transformers
 tags:
 - text-generation-inference
 - Qwen
 - Hoags
 ---
 ![sdefsed.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/hpDw305N-pMouN0EiuJYL.png)
 > [!WARNING]
 > **Note:**  This model contains artifacts and may perform poorly in some cases.
 # **Hoags-2B-Exp**
 The **Hoags-2B-Exp** model is a fine-tuned version of Qwen2-VL-2B-Instruct, designed for reasoning tasks, contextual reasoning, and multi-modal understanding. If you ask for an image description, it describes the image and answers the question in a conversational manner.
 # **Key Enhancements**
 * **Advanced Contextual Reasoning**: Hoags-2B-Exp achieves state-of-the-art performance in reasoning tasks by enhancing logical inference and decision-making.
 * **Understanding images of various resolutions and aspect ratios**: The model excels at visual understanding benchmarks, including MathVista, DocVQA, RealWorldQA, MTVQA, etc.
 * **Long-Context Video Understanding**: Capable of processing and reasoning over videos of 20 minutes or more for high-quality video-based question answering, content creation, and dialogue.
 * **Device Integration**: With strong reasoning and decision-making abilities, the model can be integrated into mobile devices, robots, and automation systems for real-time operation based on both visual and textual input.
 * **Multilingual Support**: Supports text understanding in various languages within images, including English, Chinese, Japanese, Korean, Arabic, most European languages, and Vietnamese.
 # **Demo Inference**
 ![demo.png](https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/43w_tJW1-q93qHVegMhIX.png)
 # **How to Use**
 ```python
 # Suggested instruction text for the model. It is not referenced by the
 # inference example below -- presumably meant to be sent as a system message
 # (TODO confirm intended usage).
 instruction = "Analyze the image and generate a clear, concise description of the scene, objects, and actions. Respond to user queries with accurate, relevant details derived from the visual content. Maintain a natural conversational flow and ensure logical consistency. Summarize or clarify as needed for understanding."
 ```
 ```python
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 # Load the model with automatic device placement
 model = Qwen2VLForConditionalGeneration.from_pretrained(
    "prithivMLmods/Hoags-2B-Exp", torch_dtype="auto", device_map="auto"
 )
 # Recommended: Enable flash_attention_2 for better performance in multi-image and video tasks
 # (this variant additionally requires `import torch` for torch.bfloat16)
 # model = Qwen2VLForConditionalGeneration.from_pretrained(
 #     "prithivMLmods/Hoags-2B-Exp",
 #     torch_dtype=torch.bfloat16,
 #     attn_implementation="flash_attention_2",
 #     device_map="auto",
 # )
 # Load processor
 processor = AutoProcessor.from_pretrained("prithivMLmods/Hoags-2B-Exp")
 # A single user turn containing one image (by URL) and one text prompt
 messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Analyze the context of this image."},
        ],
    }
 ]
 # Prepare input: render the chat template to a prompt string, then collect
 # the image/video entries referenced by the messages
 text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
 )
 image_inputs, video_inputs = process_vision_info(messages)
 inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
 )
 # Move inputs to wherever device_map="auto" placed the model instead of
 # hard-coding "cuda", so the example also works on CPU-only machines
 inputs = inputs.to(model.device)
 # Inference
 generated_ids = model.generate(**inputs, max_new_tokens=128)
 # Drop the prompt tokens from each sequence so only newly generated text is decoded
 generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 ]
 output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )
 print(output_text)
 ```
 # **Buffer Handling**
 ```python
 # Excerpt from the body of a streaming generator function (see qwen_inference
 # in demonstration/exp.ipynb); `streamer` is the TextIteratorStreamer passed
 # to model.generate there, so this fragment is not runnable on its own.
 buffer = ""
 for new_text in streamer:
    buffer += new_text
    # Strip the <|im_end|> marker if it shows up in the accumulated text
    buffer = buffer.replace("<|im_end|>", "")
    # Yield the full text accumulated so far, not just the newest chunk
    yield buffer
 ```
 # **Key Features**
 1. **Advanced Contextual Reasoning:**  
   - Optimized for **context-aware problem-solving** and **logical inference**.
 2. **Optical Character Recognition (OCR):**  
   - Extracts and processes text from images with exceptional accuracy.
 3. **Mathematical and Logical Problem Solving:**  
   - Supports complex reasoning and outputs equations in **LaTeX format**.
 4. **Conversational and Multi-Turn Interaction:**  
   - Handles **multi-turn dialogue** with enhanced memory retention and response coherence.
 5. **Multi-Modal Inputs & Outputs:**  
   - Processes images, text, and combined inputs to generate insightful analyses.
 6. **Secure and Efficient Model Loading:**  
   - Uses **Safetensors** for faster and more secure model weight handling.
--- a/added_tokens.json
+++ b/added_tokens.json
@@ -0,0 +1,16 @@
 {
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
 }
--- a/chat_template.json
+++ b/chat_template.json
@@ -0,0 +1,3 @@
 {
  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
 }
--- a/config.json
+++ b/config.json
@@ -0,0 +1,50 @@
 {
  "_name_or_path": "prithivMLmods/Hoags-2B-Exp",
  "architectures": [
    "Qwen2VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2_vl",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "pad_token_id": 151654,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.49.0.dev0",
  "use_cache": true,
  "use_sliding_window": false,
  "video_token_id": 151656,
  "vision_config": {
    "hidden_size": 1536,
    "in_chans": 3,
    "model_type": "qwen2_vl",
    "spatial_patch_size": 14,
    "torch_dtype": "float16"
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 151936
 }
--- a/configuration.json
+++ b/configuration.json
@@ -0,0 +1 @@
 {"framework": "pytorch", "task": "visual-question-answering", "allow_remote": true}
--- a/demonstration/exp.ipynb
+++ b/demonstration/exp.ipynb
@@ -0,0 +1,343 @@
 {
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-b4-SW1aGOcF"
      },
      "source": [
        "**Hoags-2B-Exp**\n",
        "\n",
        "Qwen2VLForConditionalGeneration"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oDmd1ZObGSel"
      },
      "outputs": [],
      "source": [
        "!pip install gradio spaces transformers accelerate numpy requests torch torchvision qwen-vl-utils av ipython reportlab fpdf python-docx pillow huggingface_hub"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ovBSsRFhGbs2"
      },
      "outputs": [],
      "source": [
        "# Authenticate with Hugging Face\n",
        "from huggingface_hub import login\n",
        "\n",
        "# Log in to Hugging Face using the provided token\n",
        "hf_token = '---xxxx-xxx-xxx---'\n",
        "login(hf_token)\n",
        "\n",
        "#Demo\n",
        "import gradio as gr\n",
        "import spaces\n",
        "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer\n",
        "from qwen_vl_utils import process_vision_info\n",
        "import torch\n",
        "from PIL import Image\n",
        "import os\n",
        "import uuid\n",
        "import io\n",
        "from threading import Thread\n",
        "from reportlab.lib.pagesizes import A4\n",
        "from reportlab.lib.styles import getSampleStyleSheet\n",
        "from reportlab.lib import colors\n",
        "from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer\n",
        "from reportlab.lib.units import inch\n",
        "from reportlab.pdfbase import pdfmetrics\n",
        "from reportlab.pdfbase.ttfonts import TTFont\n",
        "import docx\n",
        "from docx.enum.text import WD_ALIGN_PARAGRAPH\n",
        "\n",
        "# Define model options\n",
        "MODEL_OPTIONS = {\n",
        "    \"Hoags\": \"prithivMLmods/Hoags-2B-Exp\",\n",
        "}\n",
        "\n",
        "# Preload models and processors into CUDA\n",
        "models = {}\n",
        "processors = {}\n",
        "for name, model_id in MODEL_OPTIONS.items():\n",
        "    print(f\"Loading {name}...\")\n",
        "    models[name] = Qwen2VLForConditionalGeneration.from_pretrained(\n",
        "        model_id,\n",
        "        trust_remote_code=True,\n",
        "        torch_dtype=torch.float16\n",
        "    ).to(\"cuda\").eval()\n",
        "    processors[name] = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)\n",
        "\n",
        "image_extensions = Image.registered_extensions()\n",
        "\n",
        "def identify_and_save_blob(blob_path):\n",
        "    \"\"\"Identifies if the blob is an image and saves it.\"\"\"\n",
        "    try:\n",
        "        with open(blob_path, 'rb') as file:\n",
        "            blob_content = file.read()\n",
        "            try:\n",
        "                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image\n",
        "                extension = \".png\"  # Default to PNG for saving\n",
        "                media_type = \"image\"\n",
        "            except (IOError, SyntaxError):\n",
        "                raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
        "\n",
        "            filename = f\"temp_{uuid.uuid4()}_media{extension}\"\n",
        "            with open(filename, \"wb\") as f:\n",
        "                f.write(blob_content)\n",
        "\n",
        "            return filename, media_type\n",
        "\n",
        "    except FileNotFoundError:\n",
        "        raise ValueError(f\"The file {blob_path} was not found.\")\n",
        "    except Exception as e:\n",
        "        raise ValueError(f\"An error occurred while processing the file: {e}\")\n",
        "\n",
        "@spaces.GPU\n",
        "def qwen_inference(model_name, media_input, text_input=None):\n",
        "    \"\"\"Handles inference for the selected model.\"\"\"\n",
        "    model = models[model_name]\n",
        "    processor = processors[model_name]\n",
        "\n",
        "    if isinstance(media_input, str):\n",
        "        media_path = media_input\n",
        "        if media_path.endswith(tuple([i for i in image_extensions.keys()])):\n",
        "            media_type = \"image\"\n",
        "        else:\n",
        "            try:\n",
        "                media_path, media_type = identify_and_save_blob(media_input)\n",
        "            except Exception as e:\n",
        "                raise ValueError(\"Unsupported media type. Please upload a valid image.\")\n",
        "\n",
        "    messages = [\n",
        "        {\n",
        "            \"role\": \"user\",\n",
        "            \"content\": [\n",
        "                {\n",
        "                    \"type\": media_type,\n",
        "                    media_type: media_path\n",
        "                },\n",
        "                {\"type\": \"text\", \"text\": text_input},\n",
        "            ],\n",
        "        }\n",
        "    ]\n",
        "\n",
        "    text = processor.apply_chat_template(\n",
        "        messages, tokenize=False, add_generation_prompt=True\n",
        "    )\n",
        "    image_inputs, _ = process_vision_info(messages)\n",
        "    inputs = processor(\n",
        "        text=[text],\n",
        "        images=image_inputs,\n",
        "        padding=True,\n",
        "        return_tensors=\"pt\",\n",
        "    ).to(\"cuda\")\n",
        "\n",
        "    streamer = TextIteratorStreamer(\n",
        "        processor.tokenizer, skip_prompt=True, skip_special_tokens=True\n",
        "    )\n",
        "    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)\n",
        "\n",
        "    thread = Thread(target=model.generate, kwargs=generation_kwargs)\n",
        "    thread.start()\n",
        "\n",
        "    buffer = \"\"\n",
        "    for new_text in streamer:\n",
        "        buffer += new_text\n",
        "        # Remove <|im_end|> or similar tokens from the output\n",
        "        buffer = buffer.replace(\"<|im_end|>\", \"\")\n",
        "        yield buffer\n",
        "\n",
        "def format_plain_text(output_text):\n",
        "    \"\"\"Formats the output text as plain text without LaTeX delimiters.\"\"\"\n",
        "    # Remove LaTeX delimiters and convert to plain text\n",
        "    plain_text = output_text.replace(\"\\\\(\", \"\").replace(\"\\\\)\", \"\").replace(\"\\\\[\", \"\").replace(\"\\\\]\", \"\")\n",
        "    return plain_text\n",
        "\n",
        "def generate_document(media_path, output_text, file_format, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Generates a document with the input image and plain text output.\"\"\"\n",
        "    plain_text = format_plain_text(output_text)\n",
        "    if file_format == \"pdf\":\n",
        "        return generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
        "    elif file_format == \"docx\":\n",
        "        return generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size)\n",
        "\n",
        "def generate_pdf(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Generates a PDF document.\"\"\"\n",
        "    filename = f\"output_{uuid.uuid4()}.pdf\"\n",
        "    doc = SimpleDocTemplate(\n",
        "        filename,\n",
        "        pagesize=A4,\n",
        "        rightMargin=inch,\n",
        "        leftMargin=inch,\n",
        "        topMargin=inch,\n",
        "        bottomMargin=inch\n",
        "    )\n",
        "    styles = getSampleStyleSheet()\n",
        "    styles[\"Normal\"].fontSize = int(font_size)\n",
        "    styles[\"Normal\"].leading = int(font_size) * line_spacing\n",
        "    styles[\"Normal\"].alignment = {\n",
        "        \"Left\": 0,\n",
        "        \"Center\": 1,\n",
        "        \"Right\": 2,\n",
        "        \"Justified\": 4\n",
        "    }[alignment]\n",
        "\n",
        "    story = []\n",
        "\n",
        "    # Add image with size adjustment\n",
        "    image_sizes = {\n",
        "        \"Small\": (200, 200),\n",
        "        \"Medium\": (400, 400),\n",
        "        \"Large\": (600, 600)\n",
        "    }\n",
        "    img = RLImage(media_path, width=image_sizes[image_size][0], height=image_sizes[image_size][1])\n",
        "    story.append(img)\n",
        "    story.append(Spacer(1, 12))\n",
        "\n",
        "    # Add plain text output\n",
        "    text = Paragraph(plain_text, styles[\"Normal\"])\n",
        "    story.append(text)\n",
        "\n",
        "    doc.build(story)\n",
        "    return filename\n",
        "\n",
        "def generate_docx(media_path, plain_text, font_size, line_spacing, alignment, image_size):\n",
        "    \"\"\"Generates a DOCX document.\"\"\"\n",
        "    filename = f\"output_{uuid.uuid4()}.docx\"\n",
        "    doc = docx.Document()\n",
        "\n",
        "    # Add image with size adjustment\n",
        "    image_sizes = {\n",
        "        \"Small\": docx.shared.Inches(2),\n",
        "        \"Medium\": docx.shared.Inches(4),\n",
        "        \"Large\": docx.shared.Inches(6)\n",
        "    }\n",
        "    doc.add_picture(media_path, width=image_sizes[image_size])\n",
        "    doc.add_paragraph()\n",
        "\n",
        "    # Add plain text output\n",
        "    paragraph = doc.add_paragraph()\n",
        "    paragraph.paragraph_format.line_spacing = line_spacing\n",
        "    paragraph.paragraph_format.alignment = {\n",
        "        \"Left\": WD_ALIGN_PARAGRAPH.LEFT,\n",
        "        \"Center\": WD_ALIGN_PARAGRAPH.CENTER,\n",
        "        \"Right\": WD_ALIGN_PARAGRAPH.RIGHT,\n",
        "        \"Justified\": WD_ALIGN_PARAGRAPH.JUSTIFY\n",
        "    }[alignment]\n",
        "    run = paragraph.add_run(plain_text)\n",
        "    run.font.size = docx.shared.Pt(int(font_size))\n",
        "\n",
        "    doc.save(filename)\n",
        "    return filename\n",
        "\n",
        "# CSS for output styling\n",
        "css = \"\"\"\n",
        "  #output {\n",
        "    height: 500px;\n",
        "    overflow: auto;\n",
        "    border: 1px solid #ccc;\n",
        "  }\n",
        ".submit-btn {\n",
        "    background-color: #cf3434  !important;\n",
        "    color: white !important;\n",
        "}\n",
        ".submit-btn:hover {\n",
        "    background-color: #ff2323 !important;\n",
        "}\n",
        ".download-btn {\n",
        "    background-color: #35a6d6 !important;\n",
        "    color: white !important;\n",
        "}\n",
        ".download-btn:hover {\n",
        "    background-color: #22bcff !important;\n",
        "}\n",
        "\"\"\"\n",
        "\n",
        "# Gradio app setup\n",
        "with gr.Blocks(css=css) as demo:\n",
        "    gr.Markdown(\"# Hoags-2B-Exp\")\n",
        "\n",
        "    with gr.Tab(label=\"Image Input\"):\n",
        "\n",
        "        with gr.Row():\n",
        "            with gr.Column():\n",
        "                model_choice = gr.Dropdown(\n",
        "                    label=\"Model Selection\",\n",
        "                    choices=list(MODEL_OPTIONS.keys()),\n",
        "                    value=\"Hoags\"\n",
        "                )\n",
        "                input_media = gr.File(\n",
        "                    label=\"Upload Image\", type=\"filepath\"\n",
        "                )\n",
        "                text_input = gr.Textbox(label=\"Question\", placeholder=\"Ask a question about the image...\")\n",
        "                submit_btn = gr.Button(value=\"Submit\", elem_classes=\"submit-btn\")\n",
        "\n",
        "            with gr.Column():\n",
        "                output_text = gr.Textbox(label=\"Output Text\", lines=10)\n",
        "                plain_text_output = gr.Textbox(label=\"Standardized Plain Text\", lines=10)\n",
        "\n",
        "        submit_btn.click(\n",
        "            qwen_inference, [model_choice, input_media, text_input], [output_text]\n",
        "        ).then(\n",
        "            lambda output_text: format_plain_text(output_text), [output_text], [plain_text_output]\n",
        "        )\n",
        "\n",
        "        # Add examples directly usable by clicking\n",
        "        with gr.Row():\n",
        "            with gr.Column():\n",
        "                line_spacing = gr.Dropdown(\n",
        "                    choices=[0.5, 1.0, 1.15, 1.5, 2.0, 2.5, 3.0],\n",
        "                    value=1.5,\n",
        "                    label=\"Line Spacing\"\n",
        "                )\n",
        "                font_size = gr.Dropdown(\n",
        "                    choices=[\"8\", \"10\", \"12\", \"14\", \"16\", \"18\", \"20\", \"22\", \"24\"],\n",
        "                    value=\"18\",\n",
        "                    label=\"Font Size\"\n",
        "                )\n",
        "                alignment = gr.Dropdown(\n",
        "                    choices=[\"Left\", \"Center\", \"Right\", \"Justified\"],\n",
        "                    value=\"Justified\",\n",
        "                    label=\"Text Alignment\"\n",
        "                )\n",
        "                image_size = gr.Dropdown(\n",
        "                    choices=[\"Small\", \"Medium\", \"Large\"],\n",
        "                    value=\"Small\",\n",
        "                    label=\"Image Size\"\n",
        "                )\n",
        "                file_format = gr.Radio([\"pdf\", \"docx\"], label=\"File Format\", value=\"pdf\")\n",
        "                get_document_btn = gr.Button(value=\"Get Document\", elem_classes=\"download-btn\")\n",
        "\n",
        "        get_document_btn.click(\n",
        "            generate_document, [input_media, output_text, file_format, font_size, line_spacing, alignment, image_size], gr.File(label=\"Download Document\")\n",
        "        )\n",
        "\n",
        "demo.launch(debug=True)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
 }
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,14 @@
 {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "max_length": 32768,
  "pad_token_id": 151654,
  "temperature": 0.01,
  "top_k": 1,
  "top_p": 0.001,
  "transformers_version": "4.49.0.dev0"
 }
--- a/merges.txt
+++ b/merges.txt
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:feecafa106f20ec6d3f182b6f292f32b8b94c76dccf267ab1d38e721beb56619
 size 4418049776
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@@ -0,0 +1,29 @@
 {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Qwen2VLImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_pixels": 3136,
  "patch_size": 14,
  "processor_class": "Qwen2VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 12845056,
    "shortest_edge": 3136
  },
  "temporal_patch_size": 2
 }
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,31 @@
 {
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|vision_pad|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:948c45c29a91dd2e6ae77d6f5a324a3d408bcca6ad443365b2e79986f1422771
 size 11420540
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,145 @@
 {
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 32768,
  "pad_token": "<|vision_pad|>",
  "padding_side": "right",
  "processor_class": "Qwen2VLProcessor",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
 }
--- a/vocab.json
+++ b/vocab.json
		`@@ -0,0 +1 @@`
							`{"framework": "pytorch", "task": "visual-question-answering", "allow_remote": true}`