初始化项目，由ModelHub XC社区提供模型

Model: rednote-hilab/dots.mocr Source: Original Platform
2026-05-22 18:07:20 +08:00
commit 8e91841ff8
22 changed files with 458199 additions and 0 deletions
--- a/.eval_results/mdpbench.yaml
+++ b/.eval_results/mdpbench.yaml
@@ -0,0 +1,198 @@
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: overall
+  value: 80.5
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: digital
+  value: 90.5
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: photographed
+  value: 77.2
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: latin
+  value: 81.7
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: de
+  value: 82.6
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: en
+  value: 87.4
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: es
+  value: 71.3
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: fr
+  value: 70.1
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: id
+  value: 84.5
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: it
+  value: 89.3
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: nl
+  value: 83.2
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: pt
+  value: 86.8
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: vi
+  value: 79.9
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: non_latin
+  value: 79.2
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: ar
+  value: 83.3
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: hi
+  value: 83.6
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: jp
+  value: 75.0
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: ko
+  value: 78.7
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: ru
+  value: 71.2
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: th
+  value: 77.9
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: zh
+  value: 84.6
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
+- dataset:
+    id: Delores-Lin/MDPBench
+    task_id: zh_t
+  value: 79.6
+  date: "2026-04-14"
+  source:
+    url: https://huggingface.co/datasets/Delores-Lin/MDPBench
+    name: MDPBench leaderboard
+    user: Delores-Lin
--- a/.eval_results/olmocrbench.yaml
+++ b/.eval_results/olmocrbench.yaml
@@ -0,0 +1,72 @@
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: overall
+  value: 83.9
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: arxiv_math
+  value: 85.9
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: old_scans_math
+  value: 85.5
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: table_tests
+  value: 90.7
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: old_scans
+  value: 48.2
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: multi_column
+  value: 85.3
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: long_tiny_text
+  value: 81.6
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: headers_footers
+  value: 94.0
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
+- dataset:
+    id: allenai/olmOCR-bench
+    task_id: baseline
+  value: 99.7
+  source:
+    url: https://huggingface.co/papers/2603.13032
+    name: dots.mocr technical report
+    user: nielsr
--- a/.eval_results/parsebench.yaml
+++ b/.eval_results/parsebench.yaml
@@ -0,0 +1,60 @@
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: mean
+  value: 55.8
+  date: '2026-03-26'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: text_content
+  value: 90.0
+  date: '2026-03-26'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: text_formatting
+  value: 47.0
+  date: '2026-03-26'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: layout
+  value: 55.8
+  date: '2026-04-09'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: chart
+  value: 0.9
+  date: '2026-04-08'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
+- dataset:
+    id: llamaindex/ParseBench
+    task_id: table
+  value: 85.2
+  date: '2026-04-08'
+  source:
+    url: https://huggingface.co/datasets/llamaindex/ParseBench
+    name: ParseBench
+    user: boyang-runllama
+  notes: "Pipeline name: dots_ocr_1_5_parse"
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/289
+++ b/289
@@ -0,0 +1,289 @@
+==================================================================
+=============== Copyright Notice and License Texts ===============
+==================================================================
+
+
+------------- LICENSE FOR gradio CODE --------------
+ 
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+    "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
+
+    "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
+
+    "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition,
+      
+"control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
+
+    "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
+
+    "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
+
+    "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
+
+    "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
+
+    "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
+
+    "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
+
+    (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
+
+    (b) You must cause any modified files to carry prominent notices stating that You changed the files; and
+
+    (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
+
+    (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
+
+    You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+    To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
+
+
+
+------------- LICENSE FOR openai CODE --------------
+
+Copyright notice:Copyright 2025 OpenAI
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR qwen_vl_utils CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR transformers CODE --------------
+
+Copyright notice:Copyright 2018- The Hugging Face team. All rights reserved.
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR huggingface_hub CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR flash-attn CODE --------------
+
+Copyright notice:Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
+
+License:BSD-3-Clause license
+
+BSD 3-Clause License
+
+Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+------------- LICENSE FOR accelerate CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR OmniDocbench CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR Qwen2.5-VL CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR Hugging Face CODE --------------
+
+Copyright notice:Copyright 2019 Ross Wightman
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR vLLM CODE --------------
+
+Copyright notice:No copyright info provided
+
+License:apache2.0
+
+Please see above.
+
+
+
+------------- LICENSE FOR Doclaynet --------------
+
+Copyright notice:No copyright info provided
+
+License:Community Data License Agreement
+
+Community Data License Agreement – Permissive – Version 1.0
+
+This is the Community Data License Agreement – Permissive, Version 1.0 (“Agreement”). Data is provided to You under this Agreement by each of the Data Providers. Your exercise of any of the rights and permissions granted below constitutes your acceptance and agreement to be bound by the terms and conditions of this Agreement.
+
+The benefits that each Data Provider receives from making Data available and that You receive from Data or otherwise under these terms and conditions shall be deemed sufficient consideration for the formation of this Agreement. Accordingly, Data Provider(s) and You (the "Parties") agree as follows:
+
+Section 1.  Definitions
+
+1.1 "Add" means to supplement Data with Your own or someone else's Data, resulting in Your “Additions.” Additions do not include Results.
+
+1.2 "Computational Use" means Your analysis (through the use of computational devices or otherwise) or other interpretation of Data. By way of example and not limitation, "Computational Use" includes the application of any computational analytical technique, the purpose of which is the analysis of any Data in digital form to generate information about Data such as patterns, trends, correlations, inferences, insights and attributes.
+
+1.3 "Data" means the information (including copyrightable information, such as images or text), collectively or individually, whether created or gathered by a Data Provider or an Entity acting on its behalf, to which rights are granted under this Agreement.
+
+1.4 "Data Provider" means any Entity (including any employee or contractor of such Entity authorized to Publish Data on behalf of such Entity) that Publishes Data under this Agreement prior to Your Receiving it.
+
+1.5 "Enhanced Data" means the subset of Data that You Publish and that is composed of (a) Your Additions and/or (b) Modifications to Data You have received under this Agreement.
+
+1.6 "Entity" means any natural person or organization that exists under the laws of the jurisdiction in which it is organized, together with all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, directly or indirectly, to cause the direction or management of such entity, whether by contract or otherwise, (b) the ownership of more than fifty percent (50%) of the outstanding shares or securities, (c) the beneficial ownership of such entity or, (d) the ability to appoint, whether by agreement or right, the majority of directors of an Entity.
+
+1.7 "Modify" means to delete, erase, correct or re-arrange Data, resulting in “Modifications.” Modifications do not include Results.
+
+1.8 "Publish" means to make all or a subset of Data (including Your Enhanced Data) available in any manner which enables its use, including by providing a copy on physical media or remote access. For any form of Entity, that is to make the Data available to any individual who is not employed by that Entity or engaged as a contractor or agent to perform work on that Entity's behalf. A "Publication" occurs each time you Publish Data.
+
+1.9 "Receive" or "Receives" means to have been given access to Data, locally or remotely.
+
+1.10 "Results" means the outcomes or outputs that You obtain from Your Computational Use of Data. Results shall not include more than a de minimis portion of the Data on which the Computational Use is based.
+
+1.11 "Sui Generis Database Rights" means rights, other than copyright, resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other equivalent rights anywhere in the world.
+
+1.12 "Use" means using Data (including accessing, copying, studying, reviewing, adapting, analyzing, evaluating, or making Computational Use of it), either by machines or humans, or a combination of both.
+
+1.13 "You" or "Your" means any Entity that Receives Data under this Agreement.
+
+Section 2. Right and License to Use and to Publish
+
+2.1 Subject to the conditions set forth in Section 3 of this Agreement, Data Provider(s) hereby grant(s) to You a worldwide, non-exclusive, irrevocable (except as provided in Section 5) right to: (a) Use Data; and (b) Publish Data.
+
+2.2 To the extent that the Data or the coordination, selection or arrangement of Data is protected or protectable under copyright, Sui Generis Database Rights, or other law, Data Provider(s) further agree(s) that such Data or coordination, selection or arrangement is hereby licensed to You and to anyone else who Receives Data under this Agreement for Use and Publication, subject to the conditions set forth in Section 3 of this Agreement.
+
+2.3 Except for these rights and licenses expressly granted, no other intellectual property rights are granted or should be implied.
+
+Section 3. Conditions on Rights Granted
+
+3.1 If You Publish Data You Receive or Enhanced Data:
+
+(a) You may do so under a license of your choice provided that you give anyone who receives the data from you the text of this Agreement, the name of this Agreement and/or a hyperlink or other method reasonably likely to provide a copy of the text of this Agreement; and
+
+(b) You must cause any Data files containing Enhanced Data to carry prominent notices that you have changed those files; and
+
+(c) If You Publish Data You Receive, You must preserve all credit or attribution to the Data Provider(s). Such retained credit or attribution includes any of the following to the extent they exist in the Data as You have Received it: legal notices or metadata; identification of the Data Provider(s); or hyperlinks to Data to the extent it is practical to do so.
+
+3.2 You may provide additional or different license terms and conditions for use, reproduction, or distribution of that Enhanced Data, or for any combination of Data and Enhanced Data as a whole, provided that Your Use and Publication of that combined Data otherwise complies with the conditions stated in this License.
+
+3.3 You and each Data Provider agree that Enhanced Data shall not be considered a work of joint authorship by virtue of its relationship to Data licensed under this Agreement and shall not require either any obligation of accounting to or the consent of any Data Provider.
+
+3.4 This Agreement imposes no obligations or restrictions on Your Use or Publication of Results.
+
+Section 4. Data Provider(s)' Representations
+
+4.1 Each Data Provider represents that the Data Provider has exercised reasonable care, to assure that: (a) the Data it Publishes was created or generated by it or was obtained from others with the right to Publish the Data under this Agreement; and (b) Publication of such Data does not violate any privacy or confidentiality obligation undertaken by the Data Provider.
+
+Section 5.  Termination
+
+5.1 All of Your rights under this Agreement will terminate, and Your right to Receive, Use or Publish the Data will be revoked or modified if You materially fail to comply with the terms and conditions of this Agreement and You do not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If your rights under this Agreement terminate, you agree to cease Receipt, Use and Publication of Data. However, your obligations and any rights and permissions granted by you under this Agreement relating to Data that you published prior to such termination will continue and survive.
+
+5.2 If you institute litigation against a Data Provider or anyone else who Receives the Data (including a cross-claim in a lawsuit) based on the Data, other than a claim asserting breach of this Agreement, then any rights previously granted to You to Receive, Use and Publish Data under this Agreement will terminate as of the date such litigation is filed.
+
+Section 6. Disclaimer of Warranties and Limitation of Liability
+
+6.1 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE DATA (INCLUDING ENHANCED DATA) IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+6.2 NEITHER YOU NOR ANY DATA PROVIDERS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE DATA OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+Section 7.  Miscellaneous
+
+7.1 You agree that it is solely your responsibility to comply with all applicable laws with regard to Your Use or Publication of Data, including any applicable privacy, data protection, security and export laws. You agree to take reasonable steps to assist a Data Provider fulfilling responsibilities to comply with applicable laws with regard to Use or Publication of Data Received hereunder.
+
+7.2 You and Data Provider(s), collectively and individually, waive and/or agree not to assert, to the extent permitted by law, any moral rights you or they hold in Data.
+
+7.3 This Agreement confers no rights or remedies upon any person or entity other than the Parties and their respective heirs, executors, successors and assigns.
+
+7.4 The Data Provider(s) reserve no right or expectation of privacy, data protection or confidentiality in any Data that they Publish under this Agreement. If you choose to Publish Data under this Agreement, you similarly do so with no reservation or expectation of any rights of privacy or confidentiality in that Data.
+
+7.5 The Community Data License Agreement workgroup under The Linux Foundation is the steward of this Agreement (“Steward”). No one other than the Steward has the right to modify or publish new versions of this Agreement. Each version will be given a distinguishing version number. You may Use and Publish Data Received hereunder under the terms of the version of the Agreement under which You originally Received the Data, or under the terms of any subsequent version published by the Steward.
+
--- a/README.md
+++ b/README.md
@@ -0,0 +1,788 @@
+---
+license: mit
+library_name: dots_mocr
+pipeline_tag: image-text-to-text
+tags:
+- image-to-text
+- ocr
+- document-parse
+- layout
+- table
+- formula
+- transformers
+- custom_code
+language:
+- en
+- zh
+- multilingual
+---
+
+<div align="center">
+
+<h1 align="center">
+dots.mocr
+</h1>
+
+[![HuggingFace](https://img.shields.io/badge/HuggingFace%20Weights-black.svg?logo=HuggingFace)](https://huggingface.co/rednote-hilab/dots.mocr)
+[![GitHub](https://img.shields.io/badge/GitHub-Repository-black?logo=github)](https://github.com/rednote-hilab/dots.mocr)
+[![arXiv](https://img.shields.io/badge/arXiv-Paper-b31b1b.svg?logo=arxiv)](https://arxiv.org/abs/2603.13032v1)
+
+
+<div align="center">
+  <a href="https://dotsocr.xiaohongshu.com" target="_blank" rel="noopener noreferrer"><strong>🖥️ Live Demo</strong></a> | 
+  <a href="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/wechat.jpg" target="_blank" rel="noopener noreferrer"><strong>💬 WeChat</strong></a> | 
+  <a href="https://www.xiaohongshu.com/user/profile/683ffe42000000001d021a4c" target="_blank" rel="noopener noreferrer"><strong>📕 rednote</strong></a> | 
+  <a href="https://x.com/rednotehilab" target="_blank" rel="noopener noreferrer"><strong>🐦 X</strong></a>
+</div>
+
+</div>
+
+
+## Introduction
+
+We present [dots.mocr](https://huggingface.co/rednote-hilab/dots.mocr). Beyond achieving state-of-the-art (SOTA) performance in standard multilingual document parsing among models of comparable size, **dots.mocr** excels at converting structured graphics (e.g., charts, UI layouts, and scientific figures) directly into SVG code. Its core capabilities encompass grounding, recognition, semantic understanding, and interactive dialogue.
+
+Simultaneously, we are releasing [dots.mocr-svg](https://huggingface.co/rednote-hilab/dots.mocr-svg), a variant specifically optimized for robust image-to-SVG parsing tasks. 
+
+More information can be found [in the paper](https://arxiv.org/abs/2603.13032v1).
+
+
+## Evaluation
+
+### 1. Document Parsing
+
+#### 1.1 Elo scores of recent models across benchmarks
+
+<table>
+  <thead>
+    <tr>
+      <th>models</th>
+      <th>olmOCR-Bench</th>
+      <th>OmniDocBench (v1.5)</th>
+      <th>XDocParse</th>
+      <th>Average</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>MonkeyOCR-pro-3B</td>
+      <td>895.0</td>
+      <td>811.3</td>
+      <td>637.1</td>
+      <td>781.1</td>
+    </tr>
+    <tr>
+      <td>GLM-OCR</td>
+      <td>884.2</td>
+      <td>972.6</td>
+      <td>820.7</td>
+      <td>892.5</td>
+    </tr>
+    <tr>
+      <td>PaddleOCR-VL-1.5</td>
+      <td>897.3</td>
+      <td>997.9</td>
+      <td>866.4</td>
+      <td>920.5</td>
+    </tr>
+    <tr>
+      <td>HunyuanOCR</td>
+      <td>997.6</td>
+      <td>1003.9</td>
+      <td>951.1</td>
+      <td>984.2</td>
+    </tr>
+    <tr>
+      <td>dots.ocr</td>
+      <td>1041.1</td>
+      <td>1027.2</td>
+      <td>1190.3</td>
+      <td>1086.2</td>
+    </tr>
+    <!-- Highlighting dots.mocr row with bold tags -->
+    <tr>
+      <td><strong>dots.mocr</strong></td>
+      <td><strong>1104.4</strong></td>
+      <td><strong>1059.0</strong></td>
+      <td><strong>1210.7</strong></td>
+      <td><strong>1124.7</strong></td>
+    </tr>
+    <tr>
+      <td>Gemini 3 Pro</td>
+      <td>1180.4</td>
+      <td>1128.0</td>
+      <td>1323.7</td>
+      <td>1210.7</td>
+    </tr>
+  </tbody>
+</table>
+
+
+> **Notes:** 
+> - Results for Gemini 3 Pro, PaddleOCR-VL-1.5, and GLM-OCR were obtained via APIs, while HunyuanOCR results were generated using local inference.
+> - The Elo score evaluation was conducted using Gemini 3 Flash. The prompt can be found at: [Elo Score Prompt](https://github.com/rednote-hilab/dots.mocr/blob/master/tools/elo_score_prompt.py). These results are consistent with the findings on [ocrarena](https://www.ocrarena.ai/battle).
+
+
+#### 1.2 olmOCR-bench
+<table>
+    <thead>
+        <tr>
+            <th>Model</th>
+            <th>ArXiv</th>
+            <th>Old scans math</th>
+            <th>Tables</th>
+            <th>Old scans</th>
+            <th>Headers & footers</th>
+            <th>Multi column</th>
+            <th>Long tiny text</th>
+            <th>Base</th>
+            <th>Overall</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>Mistral OCR API</td>
+            <td>77.2</td>
+            <td>67.5</td>
+            <td>60.6</td>
+            <td>29.3</td>
+            <td>93.6</td>
+            <td>71.3</td>
+            <td>77.1</td>
+            <td>99.4</td>
+            <td>72.0±1.1</td>
+        </tr>
+        <tr>
+            <td>Marker 1.10.1</td>
+            <td>83.8</td>
+            <td>66.8</td>
+            <td>72.9</td>
+            <td>33.5</td>
+            <td>86.6</td>
+            <td>80.0</td>
+            <td>85.7</td>
+            <td>99.3</td>
+            <td>76.1±1.1</td>
+        </tr>
+        <tr>
+            <td>MinerU 2.5.4*</td>
+            <td>76.6</td>
+            <td>54.6</td>
+            <td>84.9</td>
+            <td>33.7</td>
+            <td>96.6</td>
+            <td>78.2</td>
+            <td>83.5</td>
+            <td>93.7</td>
+            <td>75.2±1.1</td>
+        </tr>
+        <tr>
+            <td>DeepSeek-OCR</td>
+            <td>77.2</td>
+            <td>73.6</td>
+            <td>80.2</td>
+            <td>33.3</td>
+            <td>96.1</td>
+            <td>66.4</td>
+            <td>79.4</td>
+            <td>99.8</td>
+            <td>75.7±1.0</td>
+        </tr>
+        <tr>
+            <td>Nanonets-OCR2-3B</td>
+            <td>75.4</td>
+            <td>46.1</td>
+            <td>86.8</td>
+            <td>40.9</td>
+            <td>32.1</td>
+            <td>81.9</td>
+            <td>93.0</td>
+            <td>99.6</td>
+            <td>69.5±1.1</td>
+        </tr>
+        <tr>
+            <td>PaddleOCR-VL*</td>
+            <td>85.7</td>
+            <td>71.0</td>
+            <td>84.1</td>
+            <td>37.8</td>
+            <td>97.0</td>
+            <td>79.9</td>
+            <td>85.7</td>
+            <td>98.5</td>
+            <td>80.0±1.0</td>
+        </tr>
+        <tr>
+            <td>Infinity-Parser 7B*</td>
+            <td>84.4</td>
+            <td>83.8</td>
+            <td>85.0</td>
+            <td>47.9</td>
+            <td>88.7</td>
+            <td>84.2</td>
+            <td>86.4</td>
+            <td>99.8</td>
+            <td>82.5±?</td>
+        </tr>
+        <tr>
+            <td>olmOCR v0.4.0</td>
+            <td>83.0</td>
+            <td>82.3</td>
+            <td>84.9</td>
+            <td>47.7</td>
+            <td>96.1</td>
+            <td>83.7</td>
+            <td>81.9</td>
+            <td>99.7</td>
+            <td>82.4±1.1</td>
+        </tr>
+        <tr>
+            <td>Chandra OCR 0.1.0*</td>
+            <td>82.2</td>
+            <td>80.3</td>
+            <td>88.0</td>
+            <td>50.4</td>
+            <td>90.8</td>
+            <td>81.2</td>
+            <td>92.3</td>
+            <td>99.9</td>
+            <td>83.1±0.9</td>
+        </tr>
+        <tr>
+            <td>dots.ocr</td>
+            <td>82.1</td>
+            <td>64.2</td>
+            <td>88.3</td>
+            <td>40.9</td>
+            <td>94.1</td>
+            <td>82.4</td>
+            <td>81.2</td>
+            <td>99.5</td>
+            <td>79.1±1.0</td>
+        </tr>
+        <tr>
+            <td><strong>dots.mocr</strong></td>
+            <td><strong>85.9</strong></td>
+            <td><strong>85.5</strong></td>
+            <td><strong>90.7</strong></td>
+            <td>48.2</td>
+            <td>94.0</td>
+            <td><strong>85.3</strong></td>
+            <td>81.6</td>
+            <td>99.7</td>
+            <td><strong>83.9±0.9</strong></td>
+        </tr>
+    </tbody>
+</table>
+
+
+> **Note:**
+> - The metrics are from [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
+> - We delete the Page-header and Page-footer cells in the result markdown.
+
+
+#### 1.3 Other Benchmarks
+
+<table>
+  <thead>
+    <tr>
+      <th>Model Type</th>
+      <th>Methods</th>
+      <th>Size</th>
+      <th>OmniDocBench(v1.5)<br>TextEdit↓</th>
+      <th>OmniDocBench(v1.5)<br>Read OrderEdit↓</th>
+      <th>pdf-parse-bench</th>
+    </tr>
+  </thead>
+  <tbody>
+    <!-- GeneralVLMs Group (Reversed Order, 3 rows) -->
+    <tr>
+      <td rowspan="3"><strong>GeneralVLMs</strong></td>
+      <td>Gemini-2.5 Pro</td>
+      <td>-</td>
+      <td>0.075</td>
+      <td>0.097</td>
+      <td>9.06</td>
+    </tr>
+    <tr>
+      <td>Qwen3-VL-235B-A22B-Instruct</td>
+      <td>235B</td>
+      <td>0.069</td>
+      <td>0.068</td>
+      <td><strong>9.71</strong></td>
+    </tr>
+    <tr>
+      <td>Gemini 3 Pro</td>
+      <td>-</td>
+      <td>0.066</td>
+      <td>0.079</td>
+      <td>9.68</td>
+    </tr>
+    <!-- SpecializedVLMs Group (Reversed Order, 12 rows) -->
+    <tr>
+      <td rowspan="12"><strong>SpecializedVLMs</strong></td>
+      <td>Mistral OCR</td>
+      <td>-</td>
+      <td>0.164</td>
+      <td>0.144</td>
+      <td>8.84</td>
+    </tr>
+    <tr>
+      <td>Deepseek-OCR</td>
+      <td>3B</td>
+      <td>0.073</td>
+      <td>0.086</td>
+      <td>8.26</td>
+    </tr>
+    <tr>
+      <td>MonkeyOCR-3B</td>
+      <td>3B</td>
+      <td>0.075</td>
+      <td>0.129</td>
+      <td>9.27</td>
+    </tr>
+    <tr>
+      <td>OCRVerse</td>
+      <td>4B</td>
+      <td>0.058</td>
+      <td>0.071</td>
+      <td>--</td>
+    </tr>
+    <tr>
+      <td>MonkeyOCR-pro-3B</td>
+      <td>3B</td>
+      <td>0.075</td>
+      <td>0.128</td>
+      <td>-</td>
+    </tr>
+    <tr>
+      <td>MinerU2.5</td>
+      <td>1.2B</td>
+      <td>0.047</td>
+      <td>0.044</td>
+      <td>-</td>
+    </tr>
+    <tr>
+      <td>PaddleOCR-VL</td>
+      <td>0.9B</td>
+      <td>0.035</td>
+      <td>0.043</td>
+      <td>9.51</td>
+    </tr>
+    <tr>
+      <td>HunyuanOCR</td>
+      <td>0.9B</td>
+      <td>0.042</td>
+      <td>-</td>
+      <td>-</td>
+    </tr>
+    <tr>
+      <td>PaddleOCR-VL1.5</td>
+      <td>0.9B</td>
+      <td>0.035</td>
+      <td>0.042</td>
+      <td>-</td>
+    </tr>
+    <tr>
+      <td>GLMOCR</td>
+      <td>0.9B</td>
+      <td>0.04</td>
+      <td>0.043</td>
+      <td>-</td>
+    </tr>
+    <tr>
+      <td>dots.ocr</td>
+      <td>3B</td>
+      <td>0.048</td>
+      <td>0.053</td>
+      <td>9.29</td>
+    </tr>
+    <tr>
+      <td><u><strong>dots.mocr</strong></u></td>
+      <td>3B</td>
+      <td><strong>0.031</strong></td>
+      <td><strong>0.029</strong></td>
+      <td>9.54</td>
+    </tr>
+  </tbody>
+</table>
+
+> **Note:**
+> - Metrics are sourced from [OmniDocBench](https://github.com/opendatalab/OmniDocBench) and other model publications. [pdf-parse-bench](https://github.com/phorn1/pdf-parse-bench) results are reproduced by Qwen3-VL-235B-A22B-Instruct.
+> - Formula and Table metrics for OmniDocBench1.5 are omitted due to their high sensitivity to detection and matching protocols.
+
+
+### 2. Structured Graphics Parsing
+Visual languages (e.g., charts, graphics, chemical formulas, logos) encapsulate dense human knowledge. **dots.mocr** unifies the interpretation of these elements by parsing them directly into **SVG code**.
+
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2" style="text-align: left;">Methods</th>
+      <th colspan="3">Unisvg</th>
+      <th rowspan="2">Chartmimic</th>
+      <th rowspan="2">Design2Code</th>
+      <th rowspan="2">Genexam</th>
+      <th rowspan="2">SciGen</th>
+      <th rowspan="2">ChemDraw</th>
+    </tr>
+    <tr>
+      <th>Low-Level</th>
+      <th>High-Level</th>
+      <th>Score</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left;">OCRVerse</td>
+      <td>0.632</td>
+      <td>0.852</td>
+      <td>0.763</td>
+      <td>0.799</td>
+      <td>-</td>
+      <td>-</td>
+      <td>-</td>
+      <td>0.881</td>
+    </tr>
+    <tr>
+      <td style="text-align: left;">Gemini 3 Pro</td>
+      <td>0.563</td>
+      <td>0.850</td>
+      <td>0.735</td>
+      <td>0.788</td>
+      <td>0.760</td>
+      <td>0.756</td>
+      <td>0.783</td>
+      <td>0.839</td>
+    </tr>
+    <tr>
+      <td style="text-align: left;">dots.mocr</td>
+      <td>0.850</td>
+      <td>0.923</td>
+      <td>0.894</td>
+      <td>0.772</td>
+      <td>0.801</td>
+      <td>0.664</td>
+      <td>0.660</td>
+      <td>0.790</td>
+    </tr>
+    <tr>
+      <td style="text-align: left;"><strong>dots.mocr-svg</strong></td>
+      <td><strong>0.860</strong></td>
+      <td><strong>0.931</strong></td>
+      <td><strong>0.902</strong></td>
+      <td><strong>0.905</strong></td>
+      <td><strong>0.834</strong></td>
+      <td><strong>0.8</strong></td>
+      <td><strong>0.797</strong></td>
+      <td><strong>0.901</strong></td>
+    </tr>
+  </tbody>
+</table>
+
+
+> **Note:**
+> - We use the ISVGEN metric from [UniSVG](https://ryanlijinke.github.io/) to evaluate the parsing result. For benchmarks that do not natively support image parsing, we use the original images as input, and calculate the ISVGEN score between the rendered output and the original image. 
+> - [OCRVerse](https://github.com/DocTron-hub/OCRVerse) results are derived from various code formats (e.g., SVG, Python), whereas results for Gemini 3 Pro and dots.mocr are based specifically on SVG code.
+> - Due to the capacity constraints of a 3B-parameter VLM, dots.mocr may not yet excel at every task, such as SVG generation. To complement it, we are simultaneously releasing dots.mocr-svg. We plan to further address these limitations in future updates.
+
+
+### 3. General Vision Tasks
+
+<table>
+    <thead>
+        <tr>
+            <th>Model</th>
+            <th>CharXiv_descriptive</th>
+            <th>CharXiv_reasoning</th>
+            <th>OCR_Reasoning</th>
+            <th>infovqa</th>
+            <th>docvqa</th>
+            <th>ChartQA</th>
+            <th>OCRBench</th>
+            <th>AI2D</th>
+            <th>CountBenchQA</th>
+            <th>refcoco</th>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>Qwen3vl-2b-instruct</td>
+            <td>62.3</td>
+            <td>26.8</td>
+            <td>-</td>
+            <td>72.4</td>
+            <td>93.3</td>
+            <td>-</td>
+            <td>85.8</td>
+            <td>76.9</td>
+            <td>88.4</td>
+            <td>-</td>
+        </tr>
+        <tr>
+            <td>Qwen3vl-4b-instruct</td>
+            <td>76.2</td>
+            <td>39.7</td>
+            <td>-</td>
+            <td>80.3</td>
+            <td>95.3</td>
+            <td>-</td>
+            <td>88.1</td>
+            <td>84.1</td>
+            <td>84.9</td>
+            <td>-</td>
+        </tr>
+        <tr>
+            <td><strong>dots.mocr</strong></td>
+            <td>77.4</td>
+            <td>55.3</td>
+            <td>22.85</td>
+            <td>73.76</td>
+            <td>91.85</td>
+            <td>83.2</td>
+            <td>86.0</td>
+            <td>82.16</td>
+            <td>94.46</td>
+            <td>80.03</td>
+        </tr>
+    </tbody>
+</table>
+
+
+
+# Quick Start
+## 1. Installation
+### Install dots.mocr
+```shell
+conda create -n dots_mocr python=3.12
+conda activate dots_mocr
+
+git clone https://github.com/rednote-hilab/dots.mocr.git
+cd dots.mocr
+
+# Install pytorch, see https://pytorch.org/get-started/previous-versions/ for your cuda version
+# pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
+# install flash-attn==2.8.0.post2 for faster inference
+pip install -e .
+```
+
+If you have trouble with the installation, try our [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) for an easier setup.
+
+
+### Download Model Weights
+> 💡**Note:** Please use a directory name without periods (e.g., `DotsMOCR` instead of `dots.mocr`) for the model save path. This is a temporary workaround pending our integration with Transformers.
+```shell
+python3 tools/download_model.py
+
+# with modelscope
+python3 tools/download_model.py --type modelscope
+```
+
+
+## 2. Deployment
+### vLLM inference
+We highly recommend using vLLM for deployment and inference. **Since vLLM version 0.11.0, Dots OCR has been officially integrated into vLLM with verified performance**, and you can use the vLLM Docker image directly (e.g., `vllm/vllm-openai:v0.11.0`) to deploy the model server.
+
+```shell
+# Launch vLLM model server
+## dots.mocr
+CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.mocr --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
+
+## dots.mocr-svg
+CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.mocr-svg --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
+
+# vLLM API Demo
+# See dots_mocr/model/inference.py and dots_mocr/utils/prompts.py for details on parameter and prompt settings 
+# that help achieve the best output quality.
+## document parsing
+python3 ./demo/demo_vllm.py --prompt_mode prompt_layout_all_en 
+## web parsing 
+python3 ./demo/demo_vllm.py --prompt_mode prompt_web_parsing --image_path ./assets/showcase/origin/webpage_1.png
+## scene spotting
+python3 ./demo/demo_vllm.py --prompt_mode prompt_scene_spotting --image_path ./assets/showcase/origin/scene_1.jpg
+## image parsing with svg code
+python3 ./demo/demo_vllm_svg.py --prompt_mode prompt_image_to_svg 
+## general qa
+python3 ./demo/demo_vllm_general.py
+```
+
+### Hugging Face inference
+```shell
+python3 demo/demo_hf.py
+```
+
+<details>
+<summary><b>Hugging Face inference details</b></summary>
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
+from qwen_vl_utils import process_vision_info
+from dots_mocr.utils import dict_promptmode_to_prompt
+
+model_path = "./weights/DotsMOCR"
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True
+)
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+image_path = "demo/demo_image1.jpg"
+prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
+
+1. Bbox format: [x1, y1, x2, y2]
+
+2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
+
+3. Text Extraction & Formatting Rules:
+    - Picture: For the 'Picture' category, the text field should be omitted.
+    - Formula: Format its text as LaTeX.
+    - Table: Format its text as HTML.
+    - All Others (Text, Title, etc.): Format their text as Markdown.
+
+4. Constraints:
+    - The output text must be the original text from the image, with no translation.
+    - All layout elements must be sorted according to human reading order.
+
+5. Final Output: The entire output must be a single JSON object.
+"""
+
+messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image_path
+                },
+                {"type": "text", "text": prompt}
+            ]
+        }
+    ]
+
+# Preparation for inference
+text = processor.apply_chat_template(
+    messages, 
+    tokenize=False, 
+    add_generation_prompt=True
+)
+image_inputs, video_inputs = process_vision_info(messages)
+inputs = processor(
+    text=[text],
+    images=image_inputs,
+    videos=video_inputs,
+    padding=True,
+    return_tensors="pt",
+)
+
+inputs = inputs.to("cuda")
+
+# Inference: Generation of the output
+generated_ids = model.generate(**inputs, max_new_tokens=24000)
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
+
+```
+
+</details>
+
+### Hugging Face inference with CPU
+Please refer to [CPU inference](https://github.com/rednote-hilab/dots.ocr/issues/1#issuecomment-3148962536)
+
+
+## 3. Document Parse
+**Based on vLLM server**, you can parse an image or a pdf file using the following commands:
+```bash
+
+# Parse all layout info, both detection and recognition
+# Parse a single image
+python3 dots_mocr/parser.py demo/demo_image1.jpg
+# Parse a single PDF
+python3 dots_mocr/parser.py demo/demo_pdf1.pdf  --num_thread 64  # try bigger num_threads for pdf with a large number of pages
+
+# Layout detection only
+python3 dots_mocr/parser.py demo/demo_image1.jpg --prompt prompt_layout_only_en
+
+# Parse text only, except Page-header and Page-footer
+python3 dots_mocr/parser.py demo/demo_image1.jpg --prompt prompt_ocr
+
+
+```
+**Based on Transformers**, you can parse an image or a pdf file using the same commands above, just add `--use_hf true`. 
+
+> Note: Transformers is slower than vLLM. If you want to use `demo/*` with Transformers, just add `use_hf=True` in `DotsMOCRParser(..,use_hf=True)`.
+
+<details>
+<summary><b>Output Results</b></summary>
+
+1.  **Structured Layout Data** (`demo_image1.json`): A JSON file containing the detected layout elements, including their bounding boxes, categories, and extracted text.
+2.  **Processed Markdown File** (`demo_image1.md`): A Markdown file generated from the concatenated text of all detected cells.
+    *   An additional version, `demo_image1_nohf.md`, is also provided, which excludes page headers and footers for compatibility with benchmarks like Omnidocbench and olmOCR-bench.
+3.  **Layout Visualization** (`demo_image1.jpg`): The original image with the detected layout bounding boxes drawn on it.
+
+</details>
+
+
+## 4. Demo
+Have fun with the [live demo](https://dotsocr.xiaohongshu.com/).
+
+
+### Examples for document parsing
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/formula1.png" alt="formula1.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/table3.png" alt="table3.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/Tibetan.png" alt="Tibetan.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/tradition_zh.png" alt="tradition_zh.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/nl.png" alt="nl.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/kannada.png" alt="kannada.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/russian.png" alt="russian.png" border="0" />
+
+
+### Examples for image parsing
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_1.png" alt="svg_1.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_2.png" alt="svg_2.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_4.png" alt="svg_4.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_5.png" alt="svg_5.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_6.png" alt="svg_6.png" border="0" />
+
+> **Note:**
+> - Results generated with dots.mocr-svg.
+
+### Examples for web parsing
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/webpage_1.png" alt="webpage_1.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/webpage_2.png" alt="webpage_2.png" border="0" />
+
+### Examples for scene spotting
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/scene_1.png" alt="scene_1.png" border="0" />
+<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/scene_2.png" alt="scene_2.png" border="0" />
+
+
+# Limitations & Future Work
+
+- **Complex Document Elements:**
+  - **Table&Formula**: The extraction of complex tables and mathematical formulas persists as a difficult task given the model's compact architecture.
+  - **Picture**: We have adopted an SVG code representation for parsing structured graphics; however, the performance has yet to achieve the desired level of robustness.
+
+- **Parsing Failures:** While we have reduced the rate of parsing failures compared to the previous version, these issues may still occur occasionally. We remain committed to further resolving these edge cases in future updates. 
+
+
+# Citation
+
+```BibTeX
+@misc{zheng2026multimodalocrparsedocuments,
+      title={Multimodal OCR: Parse Anything from Documents}, 
+      author={Handong Zheng and Yumeng Li and Kaile Zhang and Liang Xin and Guangwei Zhao and Hao Liu and Jiayu Chen and Jie Lou and Jiyu Qiu and Qi Fu and Rui Yang and Shuo Jiang and Weijian Luo and Weijie Su and Weijun Zhang and Xingyu Zhu and Yabin Li and Yiwei Ma and Yu Chen and Zhaohui Yu and Guang Yang and Colin Zhang and Lei Zhang and Yuliang Liu and Xiang Bai},
+      year={2026},
+      eprint={2603.13032},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2603.13032}, 
+}
+```
--- a/chat_template.json
+++ b/chat_template.json
@@ -0,0 +1,3 @@
+{
+    "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for m in messages %}{%- if m.role == 'system' %}{{- '<|system|>' + m.content + '<|endofsystem|>\n' }}{%- elif m.role == 'user' %}{% if m.content is string %}{{- '<|user|>' + m.content + '<|endofuser|>' }}{% else %}{{- '<|user|>' }}{% for content in m.content %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|img|><|imgpad|><|endofimg|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|img|><|video_pad|><|endofimg|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{- '<|endofuser|>' }}{%- endif %}{%- elif m.role == 'assistant' %}{{- '<|assistant|>' + m.content }}{%- if not loop.last %}{{- '<|endofassistant|>' }}{%- endif %}{%- endif %}{%- endfor %}{%- if messages[-1].role != 'assistant' %}{{- '<|assistant|>' }}{%- endif %}"
+}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,51 @@
+{
+    "architectures": [
+        "DotsOCRForCausalLM"
+    ],
+    "model_type": "dots_ocr",
+    "auto_map": {
+        "AutoConfig": "configuration_dots.DotsOCRConfig",
+        "AutoModelForCausalLM": "modeling_dots_ocr.DotsOCRForCausalLM"
+        },
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "hidden_act": "silu",
+    "hidden_size": 1536,
+    "initializer_range": 0.02,
+    "intermediate_size": 8960,
+    "max_position_embeddings": 131072,
+    "max_window_layers": 28,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 2,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "sliding_window": 131072,
+    "tie_word_embeddings": false,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.51.0",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151936,
+    "image_token_id": 151665,
+    "video_token_id": 151656,
+    "vision_config": {
+        "embed_dim": 1536,
+        "hidden_size": 1536,
+        "intermediate_size": 4224,
+        "num_hidden_layers": 42,
+        "num_attention_heads": 12,
+        "num_channels": 3,
+        "patch_size": 14,
+        "post_norm": true,
+        "rms_norm_eps": 1e-05,
+        "spatial_merge_size": 2,
+        "temporal_patch_size": 1,
+        "use_bias": false,
+        "attn_implementation": "flash_attention_2",
+        "init_merger_std": 0.02,
+        "initializer_range": 0.02,
+        "is_causal": false
+    }
+}
--- a/configuration_dots.py
+++ b/configuration_dots.py
@@ -0,0 +1,78 @@
+from typing import Any, Optional
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.qwen2 import Qwen2Config
+from transformers import Qwen2_5_VLProcessor, AutoProcessor
+from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+
+class DotsVisionConfig(PretrainedConfig):
+    """Configuration container for the vision encoder (model type ``dots_vit``).
+
+    Every named constructor argument is stored unchanged as an instance
+    attribute of the same name. Any additional keyword arguments are
+    forwarded to ``PretrainedConfig.__init__``.
+    """
+
+    model_type: str = "dots_vit"
+
+    def __init__(
+        self,
+        embed_dim: int = 1536,  # vision encoder embed size
+        hidden_size: int = 1536,  # after merger hidden size
+        intermediate_size: int = 4224,
+        num_hidden_layers: int = 42,
+        num_attention_heads: int = 12,
+        num_channels: int = 3,
+        patch_size: int = 14,
+        spatial_merge_size: int = 2,
+        temporal_patch_size: int = 1,
+        rms_norm_eps: float = 1e-5,
+        use_bias: bool = False,
+        attn_implementation="flash_attention_2",  # "eager","sdpa","flash_attention_2"
+        initializer_range=0.02,
+        init_merger_std=0.02,
+        is_causal=False,  # ve causal forward
+        post_norm=True,
+        gradient_checkpointing=False,
+        **kwargs: Any,
+    ):
+        # Let the base class consume the generic config kwargs first, then
+        # record the vision-specific settings on the instance.
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.rms_norm_eps = rms_norm_eps
+        self.use_bias = use_bias
+        # NOTE(review): the default is "flash_attention_2"; presumably this
+        # needs the flash-attn package at model build time - confirm.
+        self.attn_implementation = attn_implementation
+        self.initializer_range = initializer_range
+        self.init_merger_std = init_merger_std
+        self.is_causal = is_causal
+        self.post_norm = post_norm
+        self.gradient_checkpointing = gradient_checkpointing
+
+
+
+class DotsOCRConfig(Qwen2Config):
+    """``Qwen2Config`` extended with vision settings (model type ``dots_ocr``).
+
+    Args:
+        image_token_id: Stored as ``self.image_token_id`` (default 151665).
+        video_token_id: Stored as ``self.video_token_id`` (default 151656).
+        vision_config: Keyword arguments used to build a ``DotsVisionConfig``.
+            ``None`` (or an empty dict) produces a ``DotsVisionConfig`` with
+            all of its defaults.
+        *args, **kwargs: Forwarded to ``Qwen2Config.__init__``.
+    """
+    model_type = "dots_ocr"
+    def __init__(self, 
+        image_token_id = 151665, 
+        video_token_id = 151656,
+        vision_config: Optional[dict] = None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        # vision_config is unpacked with **, so it has to be a mapping.
+        # NOTE(review): an already-built DotsVisionConfig instance would
+        # presumably fail here - confirm callers only ever pass dicts.
+        self.vision_config = DotsVisionConfig(**(vision_config or {}))
+
+    def save_pretrained(self, save_directory, **kwargs):
+        """Reset ``_auto_class`` to ``None``, then delegate to the parent ``save_pretrained``."""
+        # NOTE(review): presumably cleared so the saved config does not carry
+        # auto-class registration info - confirm the intent.
+        self._auto_class = None
+        super().save_pretrained(save_directory, **kwargs)
+
+
+class DotsVLProcessor(Qwen2_5_VLProcessor):
+    """``Qwen2_5_VLProcessor`` subclass that sets the image/video placeholder tokens.
+
+    ``image_processor``, ``tokenizer``, ``video_processor`` and
+    ``chat_template`` are passed to the parent constructor. Extra ``**kwargs``
+    are accepted but are not forwarded.
+    """
+    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        # Use the tokenizer's own placeholder token when it defines one;
+        # otherwise fall back to the literal placeholder string.
+        self.image_token = "<|imgpad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        # Token ids are fixed constants here; they are not looked up from the tokenizer.
+        self.image_token_id = 151665
+        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        self.video_token_id = 151656
+
+AutoProcessor.register("dots_ocr", DotsVLProcessor)
+CONFIG_MAPPING.register("dots_ocr", DotsOCRConfig)
--- a/dots.mocr
+++ b/dots.mocr
@@ -0,0 +1,109 @@
+dots.mocr LICENSE AGREEMENT
+
+Effective Date: [August 8, 2025]
+
+Copyright Holder: [Xingyin Information Technology (Shanghai) Co., Ltd]
+
+This License Agreement (“Agreement”) governs Your use, reproduction, modification, and distribution of dots.mocr (the "Model Materials"). This Agreement is designed to maximize the openness and use of the Model Materials while addressing the unique legal, ethical, and technical challenges posed by large language models.
+
+WHEREAS, Licensor has developed the dots.mocr document parsing model and intends to distribute the Model Materials under an open‑source framework;
+WHEREAS, traditional open-source licenses (e.g., the MIT License) may not fully address the inherent complexities of document parsing models, namely their multiple components (code, weights, training data), potential ethical risks, data‑governance issues, and intellectual‑property and liability questions regarding AI‑generated content;
+WHEREAS, Licensor seeks to provide a legal framework that ensures maximum access to and use of the Model Materials while clearly defining the rights, obligations, and liabilities of Licensee;
+
+THEREFORE, the parties agree that, subject to the MIT License, they shall be bound by the following terms and conditions:
+
+1. Definitions and Interpretation
+Purpose: To define key terms used in this Agreement, particularly "Model Materials," ensuring clarity of the license scope beyond traditional software code. To clarify the order of precedence between this Agreement and the MIT License to avoid conflict.
+
+1.1 “Licensor” shall mean the entity providing the Model Materials under this Agreement, namely [Xingyin Information Technology (Shanghai) Co., Ltd].
+
+1.2 “Licensee” or "You" shall mean any individual or entity exercising permissions granted by this Agreement.
+
+1.3 “Model Materials” shall mean all materials provided by Licensor under this Agreement, including but not limited to:
+        (a) one or more machine‑learning models, including architecture and trained parameters (i.e., model weights);
+        (b) all associated preprocessing, training, inference, and fine‑tuning code;
+        (c) training datasets and evaluation scripts (or their detailed descriptions and access mechanisms); and
+        (d) any accompanying documentation, metadata, and tools.
+The above Model Materials shall be subject to the content published on the Licensor’s website or GitHub repository at https://github.com/rednote-hilab/dots.mocr.
+
+1.4 “Outputs” shall mean any content generated through the use of the Model Materials, such as text, tables, code, layout information, and formulas extracted from documents.
+
+1.5 “MIT License” shall mean The MIT Open Source License published by the Massachusetts Institute of Technology.
+
+1.6   Priority of Agreement. In the event of any conflict or inconsistency between this Agreement and the MIT License, the terms of the MIT License shall prevail. However, if the terms of the MIT License are ambiguous or silent on a particular matter, the provisions of this Agreement shall apply and supplement the MIT License.
+
+2. Grant of Rights and Scope of Use
+
+Purpose: To grant broad, permissive rights to the Licensee for the Model Materials—including code, weights, data, and documentation—to ensure maximum openness and flexibility while clarifying the free use of model-generated content. Additionally, it clarifies the feasibility of transitioning from open-source to commercial‑use and the use of OpenAPI interfaces.
+
+2.1   Grant of Copyright License. Subject to Licensee's compliance with this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non‑exclusive, no-charge, royalty‑free copyright license to use (run or test), reproduce, modify, create derivative works of, merge, publish, distribute the Model Materials; sublicense and/or sell copies of the Model Materials or any derivative works thereof; and incorporate the unmodified or modified Model Materials into proprietary products or services, including for commercial purposes, software‑as‑a‑service (SaaS) offerings, or via OpenAPI or other interfaces.
+
+2.2   Fundamental Capabilities. The Model Materials only provide the fundamental model’s capabilities. Licensees may develop derivative AI applications or undertake task‑specific training thereon.
+
+2.3   From Open Source to Commercial Use. The open-source release does not preclude Licensor’s commercial exploitation of the Model Materials, in whole or in part. Any such commercial use shall, at that time, be subject to license agreements between Licensor and applicable users.
+
+2.4   API‑Service Exception. Licensees who access the Model Materials through API calls or provide model services via API interfaces (without directly distributing model weights) shall not be subject to this Agreement unless otherwise expressly agreed. Instead, such use shall be governed by the API terms of use published by Licensor (if any).
+
+3. Acceptable Use Policy and Prohibited Uses
+
+3.1   Responsible Use. Licensee must use the Model Materials in a responsible, ethical, and lawful manner, in compliance with all applicable laws, regulations, industry standards, and best practices.
+
+3.2   Enterprise On‑Premises Deployment. The Licensee may deploy the Model Materials in closed‑source, on‑premises enterprise environments.
+
+3.3   Prohibited Uses. Any breach of the prohibitions below will result in the automatic termination of all licenses granted under this Agreement. Licensee agrees not to use the Model Materials or any derivative works thereof, in connection with:
+(a) Identification and Utilization of Illegal/Harmful Content: Includes identifying graphic/text materials used for counterfeiting certificates/invoices, perpetrating fraud, or launching cyberattacks; or processing images containing illegal content such as violence, criminal activities, disinformation, or child exploitation.
+(b) Privacy Infringement and Discriminatory Practices: Extracting personal sensitive information (e.g., ID numbers, medical records, biometric data) or protected characteristics (e.g., race, gender) from images without legal authorization or consent, for purposes of privacy violation, automated discriminatory decision-making, or harassment.
+(c) Copyright Restrictions: Licensees shall not use the tool for unauthorized digitization of publications/document scanning or bulk scraping of content. Any use involving publications or other copyright-protected materials must first obtain relevant permissions.
+
+4. Intellectual Property Ownership and Contributions
+
+4.1   Licensor's Copyright Reservation. Licensor reserves all right, title, and interest in and to the Model Materials (including the model architecture, parameters, code, and original training data), except as expressly licensed herein. The original copyright of the Model Materials belongs to the Licensor.
+
+4.2   Patent License. Subject to the terms and conditions of this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model Materials, where such license applies only to those patent claims licensable by the Licensor that are necessarily infringed by its contribution(s).
+If Licensee institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model Materials constitute direct or contributory patent infringement, then any patent licenses granted under this License for the Model Materials shall terminate as of the date such litigation is asserted or filed.
+
+4.3   Outputs: The Outputs generated through the use of the Model Materials generally refer to text, tables, layouts, and other content extracted from documents or images. The extracted content itself does not generate new intellectual property rights, and all intellectual property remains with the original authors or copyright holders. The Licensee is responsible for due diligence regarding the legality of the Outputs, particularly where the content extracted by the OCR model may be substantially similar to existing copyrighted works, which could present intellectual property infringement risks. The Licensor assumes no liability for such infringements.
+4.4   Trademarks. Nothing in this License permits Licensee to make use of Licensor’s trademarks, trade names, logos (e.g., “rednote,” “Xiaohongshu,” “dots.mocr”) or to otherwise suggest endorsement or misrepresent the relationship between the parties, unless Licensor’s prior written approval is granted.
+
+5. Data Governance, Privacy, and Security
+ 
+5.1   Data Quality and Bias. Licensee shall use training data from lawful sources and is encouraged to conduct due diligence before deploying the Model Materials and to take reasonable steps to mitigate any known biases in its training data or applications.
+
+5.2   Privacy Protection.
+        (a) Sensitive‑Data Restrictions. It is prohibited to use the Model Materials to process, extract, or infer sensitive personal data protected under specific laws (such as GDPR or HIPAA), particularly when dealing with documents containing personally identifiable information (such as ID numbers, health data, financial information, etc.), unless Licensee has obtained all necessary consents, lawful basis, or authorizations, and has implemented adequate anonymization, pseudonymization, or other privacy-enhancing technologies.
+        (b) Data Minimization and Purpose Limitation. The Licensee shall follow the principle of data minimization when using the OCR Model, processing only the user data necessary for specific, explicit, and lawful purposes. Specifically, the OCR Model should avoid processing unnecessary sensitive data and ensure compliance with applicable privacy protection laws during data handling.
+        (c) Transparency. Licensee shall provide clear and transparent privacy policies and terms of use when processing user data, particularly during document scanning and information extraction.
+
+5.3   Security Measures. Licensee shall implement appropriate technical and administrative safeguards to protect the Model Materials and any associated data against unauthorized access, disclosure, alteration, or destruction. Such measures may include, but are not limited to, encryption, access controls, logging, and audit trails.
+
+5.4   Further Training. Licensee may only use user‑provided input or Outputs for training, fine-tuning, or improving other AI models if it has obtained the specific and informed consent of data subjects.
+
+6. Disclaimer of Warranty and Limitation of Liability
+
+6.1 “AS IS” Basis. Unless required by applicable law, the Model Materials are provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. Licensee is solely responsible for determining the appropriateness of using or redistributing the Model Materials and assumes any risks associated with the exercise of permissions under this License. Licensor does not provide any warranty of non-infringement but represents that no infringing code has been knowingly included.
+
+6.2   Outputs Disclaimer. As a neutral technology, Licensor disclaims all liability for the accuracy, completeness, reliability, safety, legality, or suitability of any Outputs. The Licensee is solely responsible for verifying the accuracy and appropriateness of AI-generated content and shall provide appropriate disclosures when publishing or relying upon such content.
+
+6.3   Limitation of Liability and Recourse. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall Licensor or contributors be liable for any claims, damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model Materials (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Licensor has been advised of the possibility of such damages. If such losses are incurred, recourse may be sought against the Licensee responsible for causing the loss.
+
+6.4   Content‑Filtering Disclaimer. Although the Model Materials may include content‑filtering mechanisms, Licensor makes no warranties of any kind regarding the stability, quality, accuracy, completeness, or any specific outcome of Outputs. Licensee is solely responsible for reviewing, verifying, and performing quality control on Outputs and assumes all associated risks and liabilities.
+
+7. Attribution and License Reservation
+
+7.1   License. When distributing or redistributing the Model Materials, Licensee must give any other recipients of the Model Materials a copy of this Agreement.
+
+7.2   Copyright and Notices. When distributing any part of the Model Materials, Licensee must retain all copyright, patent, trademark, and attribution notices included in the Model Materials.
+
+7.3   Attribution. Licensee is encouraged to prominently display the name of Licensor and the Model Materials in any public statements, products, or services that contain the Model Materials (or any derivative works thereof), to promote transparency and community trust. If Licensee distributes modified weights or fine‑tuned models based on the Model Materials, Licensee must prominently display the following statement in the related website or documentation: “Built with dots.mocr.”
+
+8. Governing Law and Dispute Resolution
+
+8.1   Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the People’s Republic of China, without regard to its conflict of laws principles.
+
+8.2   Dispute Resolution. Any dispute, claim, or disagreement arising out of or relating to this Agreement shall first be resolved through amicable consultation. If such consultation fails, the dispute shall be submitted to the Hangzhou Arbitration Commission for arbitration. The arbitration shall be conducted in accordance with the laws of China, and the place of arbitration shall be [Hangzhou, China]. The arbitral award shall be final and binding upon both parties.
+
+9. Regulatory Compliance Amendments
+In the event that any part of this Agreement becomes invalid or requires adjustment due to changes in applicable laws or regulations, Licensor reserves the right to issue a revised version of this Agreement. Licensee shall migrate to the new version within [e.g., ninety (90)] days of its release; otherwise, all rights granted under this Agreement shall automatically terminate.
+
+10. Security Reporting
+Licensee discovering any security vulnerability in the Model Materials may report it to Licensor via: dots-feedback@xiaohongshu.com. Licensee shall not disclose vulnerability details until Licensor issues an official remediation, unless otherwise required by law.
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,8 @@
+{
+  "max_length": 32768,
+  "eos_token_id": [
+    151643,
+    151672,
+    151673
+  ]
+}
--- a/merges.txt
+++ b/merges.txt
--- a/model-00001-of-00002.safetensors
+++ b/model-00001-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8f4bc75340279da003609fe93f2eb02cc1a77087f5dfb6ba46c0980e1b4da81
+size 4998547840
--- a/model-00002-of-00002.safetensors
+++ b/model-00002-of-00002.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0101b36fe6620ba135d4bb8efbbf275fc27b4363b10b7f632c058c0955f3dc4d
+size 1079883896
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,651 @@
+{
+  "metadata": {
+    "total_parameters": 3039179264,
+    "total_size": 6078358528
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
+    "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
+    "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
+    "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+    "model.norm.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.1.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.11.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.12.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.13.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.15.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.15.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.2.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.20.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.21.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.22.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.25.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.26.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.26.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.26.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.28.attn.proj.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.28.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.28.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.28.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.28.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.29.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.29.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.29.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.3.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.3.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.31.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.31.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.31.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.32.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.33.attn.proj.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.33.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.33.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.33.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.33.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.33.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.33.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.34.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.34.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.34.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.34.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.34.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.34.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.34.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.35.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.35.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.35.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.35.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.35.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.35.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.35.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.36.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.36.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.37.attn.proj.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.37.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.37.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.37.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.37.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.37.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.37.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.38.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.38.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.39.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.39.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.39.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.39.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.39.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.39.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.39.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.4.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.40.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.40.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.41.norm2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.5.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.5.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.6.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.attn.qkv.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.8.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.8.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.blocks.9.norm1.weight": "model-00002-of-00002.safetensors",
+    "vision_tower.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.merger.ln_q.bias": "model-00001-of-00002.safetensors",
+    "vision_tower.merger.ln_q.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
+    "vision_tower.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.merger.mlp.2.bias": "model-00002-of-00002.safetensors",
+    "vision_tower.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.patch_embed.patchifier.norm.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.patch_embed.patchifier.proj.bias": "model-00002-of-00002.safetensors",
+    "vision_tower.patch_embed.patchifier.proj.weight": "model-00001-of-00002.safetensors",
+    "vision_tower.post_trunk_norm.weight": "model-00001-of-00002.safetensors"
+  }
+}
--- a/modeling_dots_ocr.py
+++ b/modeling_dots_ocr.py
@@ -0,0 +1,131 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.models.qwen2 import Qwen2ForCausalLM
+
+from .configuration_dots import DotsVisionConfig, DotsOCRConfig
+from .modeling_dots_vision import DotsVisionTransformer
+
+
+# Soft limit on the number of images (rows of grid_thw) in one call; exceeding it
+# only prints a warning in DotsOCRForCausalLM.prepare_inputs_embeds.
+DOTS_VLM_MAX_IMAGES = 200
+
+
+class DotsOCRForCausalLM(Qwen2ForCausalLM):
+    config_class = DotsOCRConfig
+
+    def __init__(self, config: DotsOCRConfig):
+        super().__init__(config)
+
+        if isinstance(self.config.vision_config, dict):
+            vision_config = DotsVisionConfig(**self.config.vision_config)
+            self.config.vision_config = vision_config
+        else:
+            vision_config = self.config.vision_config
+
+        self.vision_tower = DotsVisionTransformer(vision_config)
+
+    def prepare_inputs_embeds(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        grid_thw: Optional[torch.FloatTensor] = None,
+        img_mask: Optional[torch.BoolTensor] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            assert img_mask is not None
+            if grid_thw.shape[0] > DOTS_VLM_MAX_IMAGES:
+                print(
+                    f"Num image exceeded: {grid_thw.shape[0]} > {DOTS_VLM_MAX_IMAGES}, which may cause FSDP hang"
+                )
+
+            vision_embeddings = self.vision_tower(pixel_values, grid_thw)
+
+            true_indices = torch.nonzero(img_mask).squeeze()
+            if len(true_indices) > vision_embeddings.size(0):
+                print(
+                    f"img_mask sum > VE and will be truncated, mask.sum()={len(true_indices)} {vision_embeddings.size(0)=}"
+                )
+                true_indices = true_indices[: vision_embeddings.size(0)]
+                new_img_mask = torch.zeros_like(img_mask, device=img_mask.device)
+                new_img_mask[true_indices[:, 0], true_indices[:, 1]] = True
+            else:
+                new_img_mask = img_mask
+
+            assert (
+                vision_embeddings.size(0) == new_img_mask.sum()
+            ), f"{vision_embeddings.size(0)=}, {new_img_mask.sum()=}"
+
+            inputs_embeds = inputs_embeds.masked_scatter(
+                new_img_mask.to(inputs_embeds.device).unsqueeze(-1).expand_as(inputs_embeds),
+                vision_embeddings.to(inputs_embeds.device).type(inputs_embeds.dtype),
+            )
+
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: int = 0,
+        **loss_kwargs,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        assert len(input_ids) >= 1, f"empty input_ids {input_ids.shape=} will cause gradnorm nan"
+        if inputs_embeds is None:
+            img_mask = input_ids == self.config.image_token_id
+            inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, image_grid_thw, img_mask)
+
+        outputs = super().forward(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            labels=labels,
+            use_cache=use_cache if use_cache is not None else self.config.use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            # return_dict=return_dict,
+            logits_to_keep=logits_to_keep,
+            **loss_kwargs,
+        )
+
+        return outputs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        num_logits_to_keep=None,
+        **kwargs,
+    ):
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
--- a/modeling_dots_vision.py
+++ b/modeling_dots_vision.py
@@ -0,0 +1,404 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from flash_attn import flash_attn_varlen_func
+from torch.nn import LayerNorm
+from transformers.modeling_utils import PreTrainedModel
+from .configuration_dots import DotsVisionConfig
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+    orig_dtype = tensor.dtype
+    tensor = tensor.float()
+
+    cos = freqs.cos()
+    sin = freqs.sin()
+
+    cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+    sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+
+    output = (tensor * cos) + (rotate_half(tensor) * sin)
+
+    output = output.to(orig_dtype)
+
+    return output
+
+
+class VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class PatchMerger(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        pre_norm="layernorm",
+        init_merger_std=None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size ** 2)
+        self.pre_norm = pre_norm
+        if self.pre_norm == "layernorm":
+            self.ln_q = LayerNorm(context_dim, eps=1e-6)
+        elif self.pre_norm == "rmsnorm":
+            self.ln_q = RMSNorm(context_dim, eps=1e-6)
+        else:
+            print("no norm in patch merger")
+
+        self.mlp = nn.Sequential(
+            nn.Linear(self.hidden_size, self.hidden_size),
+            nn.GELU(),
+            nn.Linear(self.hidden_size, dim),
+        )
+
+        if init_merger_std is not None:
+            nn.init.normal_(self.mlp[0].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[0].bias)
+            nn.init.normal_(self.mlp[2].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[2].bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.pre_norm:
+            x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
+        else:
+            x = self.mlp(x.view(-1, self.hidden_size))
+        return x
+
+
+class VisionAttention(nn.Module):
+    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
+        self.proj = nn.Linear(dim, dim, bias=bias)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+
+        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+
+        attention_mask = torch.full(
+            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
+        )
+        for i in range(1, len(cu_seqlens)):
+            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+        attn_weights = attn_weights + attention_mask
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
+        attn_output = torch.matmul(attn_weights, v)
+        attn_output = attn_output.transpose(0, 1)
+        attn_output = attn_output.reshape(seq_length, -1)
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+
+class VisionFlashAttention2(nn.Module):
+    """Multi-head self-attention over packed sequences using ``flash_attn_varlen_func``.
+
+    Segment boundaries are given by ``cu_seqlens`` and handed to the kernel
+    directly, so no explicit attention mask is built here.
+    """
+
+    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
+        self.proj = nn.Linear(dim, dim, bias=bias)
+        self.config = config
+        # Forwarded as the kernel's `causal` flag.
+        self.is_causal = config.is_causal
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """Return the attention output projected back to ``(seq_length, dim)``."""
+        seq_length = hidden_states.shape[0]
+        # Fused projection split into q/k/v, each (seq, heads, head_dim).
+        q, k, v = (
+            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        )  # 'shd'
+        # Rotary embedding is applied to queries and keys only.
+        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+        # Longest segment length, derived from consecutive boundary differences.
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+        # The same boundaries and max length are used for the query and key sides.
+        attn_output = flash_attn_varlen_func(
+            q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, causal=self.is_causal
+        ).reshape(seq_length, -1)
+        attn_output = self.proj(attn_output)
+
+        return attn_output
+
+
+class VisionSdpaAttention(nn.Module):
+    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
+        self.proj = nn.Linear(dim, dim, bias=bias)
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: torch.Tensor = None,
+    ) -> torch.Tensor:
+        seq_length = hidden_states.shape[0]
+        q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+
+        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
+        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
+
+        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
+        for i in range(1, len(cu_seqlens)):
+            attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
+
+        q = q.transpose(0, 1)
+        k = k.transpose(0, 1)
+        v = v.transpose(0, 1)
+
+        attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+        attn_output = attn_output.transpose(0, 1)
+        attn_output = attn_output.reshape(seq_length, -1)
+
+        attn_output = self.proj(attn_output)
+        return attn_output
+
+
+# Maps an attention-implementation name to the attention module class;
+# DotsVisionBlock looks up its `attn_implementation` argument here.
+DOTS_VISION_ATTENTION_CLASSES = {
+    "eager": VisionAttention,
+    "flash_attention_2": VisionFlashAttention2,
+    "sdpa": VisionSdpaAttention,
+}
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+    def extra_repr(self) -> str:
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+class DotsSwiGLUFFN(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        hidden_features = config.intermediate_size
+        in_features = config.embed_dim
+        bias = config.use_bias
+
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+        self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.fc1(x)) * self.fc3(x)
+        x = self.fc2(x)
+        return x
+
+
+
+class DotsPatchEmbed(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_channels = config.num_channels
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.proj = nn.Conv2d(
+            config.num_channels,
+            config.embed_dim,
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+        )
+        self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        x = x.view(-1, self.num_channels, self.temporal_patch_size, self.patch_size, self.patch_size)[:, :, 0] 
+        x = self.proj(x).view(-1, self.embed_dim)
+        x = self.norm(x)
+        return x
+
+
+class DotsViTPreprocessor(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.patch_h = config.patch_size
+        self.patch_w = config.patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.patchifier = DotsPatchEmbed(config)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        tokens = self.patchifier(x, grid_thw)
+        return tokens
+
+
+class DotsVisionBlock(nn.Module):
+    def __init__(self, config, attn_implementation: str = "flash_attention_2"):
+        super().__init__()
+        self.attn = DOTS_VISION_ATTENTION_CLASSES[attn_implementation](
+            config, config.embed_dim, num_heads=config.num_attention_heads, bias=config.use_bias
+        )
+        self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+        self.mlp = DotsSwiGLUFFN(config)
+        self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class DotsVisionTransformer(PreTrainedModel):
+    def __init__(self, config: DotsVisionConfig) -> None:
+        super().__init__(config)
+        self.config = config
+        self.spatial_merge_size = config.spatial_merge_size
+
+        self.patch_embed = DotsViTPreprocessor(config)
+        self._init_weights(self.patch_embed.patchifier.proj)
+
+        head_dim = config.embed_dim // config.num_attention_heads
+
+        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
+
+        _num_hidden_layers = config.num_hidden_layers
+        self.blocks = nn.ModuleList(
+            [DotsVisionBlock(config, config.attn_implementation) for _ in range(_num_hidden_layers)]
+        )
+
+        if self.config.post_norm:
+            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+        self.merger = PatchMerger(
+            dim=config.hidden_size,
+            context_dim=config.embed_dim,
+            spatial_merge_size=config.spatial_merge_size,
+            init_merger_std=self.config.init_merger_std,
+        )
+
+        self.gradient_checkpointing = False
+        self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv3d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.blocks[0].mlp.fc2.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def get_pos_ids_by_grid(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(
+                torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
+            )
+
+        return pos_ids
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = self.get_pos_ids_by_grid(grid_thw)
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True) -> torch.Tensor:
+        if bf16:
+            hidden_states = hidden_states.bfloat16()
+        hidden_states = self.patch_embed(hidden_states, grid_thw)
+
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+
+        for blk in self.blocks:
+            if self.gradient_checkpointing and self.training:
+                hidden_states = self._gradient_checkpointing_func(
+                    blk.__call__,
+                    hidden_states,
+                    cu_seqlens,
+                    rotary_pos_emb,
+                )
+            else:
+                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
+
+        if self.config.post_norm:
+            hidden_states = self.post_trunk_norm(hidden_states)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
--- a/preprocessor_config.json
+++ b/preprocessor_config.json
@@ -0,0 +1,22 @@
+{
+  "auto_map": {
+        "AutoProcessor": "configuration_dots.DotsVLProcessor"
+    },
+  "min_pixels": 3136,
+  "max_pixels": 11289600,
+  "patch_size": 14,
+  "temporal_patch_size": 1,
+  "merge_size": 2,
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "image_processor_type": "Qwen2VLImageProcessor",
+  "processor_class": "DotsVLProcessor"
+}
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,25 @@
+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "[PAD]"
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,391 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|imgpad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|img|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|endofimg|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|systemprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|endofsystemprompt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|endofuser|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|endofassistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "<|pic|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151678": {
+      "content": "<|text|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151679": {
+      "content": "<|pictotext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151680": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151681": {
+      "content": "<|slice|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151682": {
+      "content": "<|endofslice|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151683": {
+      "content": "<|imgrowend|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151684": {
+      "content": "<|polygon_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151685": {
+      "content": "<|polygon_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151686": {
+      "content": "<|image_gen_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151687": {
+      "content": "<|image_gen_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "chat_template": "{%- for m in messages %}\n    {%- if m.role == 'system' %}\n        {{- '<|system|>' + m.content + '<|endofsystem|>\\n' }}\n    {%- elif m.role == 'user' %}\n        {{- '<|user|>' + m.content + '<|endofuser|>' }}\n    {%- elif m.role == 'assistant' %}\n        {{- '<|assistant|>' + m.content }}\n        {%- if not loop.last %}\n            {{- '<|endofassistant|>' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if messages[-1].role != 'assistant' %}\n    {{- '<|assistant|>' }}\n{%- endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 131072,
+  "pad_token": "[PAD]",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}
--- a/vocab.json
+++ b/vocab.json