初始化项目,由ModelHub XC社区提供模型

Model: rednote-hilab/dots.mocr
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-22 18:07:20 +08:00
commit 8e91841ff8
22 changed files with 458199 additions and 0 deletions

198
.eval_results/mdpbench.yaml Normal file
View File

@@ -0,0 +1,198 @@
- dataset:
id: Delores-Lin/MDPBench
task_id: overall
value: 80.5
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: digital
value: 90.5
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: photographed
value: 77.2
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: latin
value: 81.7
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: de
value: 82.6
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: en
value: 87.4
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: es
value: 71.3
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: fr
value: 70.1
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: id
value: 84.5
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: it
value: 89.3
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: nl
value: 83.2
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: pt
value: 86.8
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: vi
value: 79.9
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: non_latin
value: 79.2
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: ar
value: 83.3
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: hi
value: 83.6
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: jp
value: 75.0
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: ko
value: 78.7
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: ru
value: 71.2
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: th
value: 77.9
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: zh
value: 84.6
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin
- dataset:
id: Delores-Lin/MDPBench
task_id: zh_t
value: 79.6
date: "2026-04-14"
source:
url: https://huggingface.co/datasets/Delores-Lin/MDPBench
name: MDPBench leaderboard
user: Delores-Lin

View File

@@ -0,0 +1,72 @@
- dataset:
id: allenai/olmOCR-bench
task_id: overall
value: 83.9
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: arxiv_math
value: 85.9
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: old_scans_math
value: 85.5
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: table_tests
value: 90.7
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: old_scans
value: 48.2
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: multi_column
value: 85.3
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: long_tiny_text
value: 81.6
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: headers_footers
value: 94.0
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr
- dataset:
id: allenai/olmOCR-bench
task_id: baseline
value: 99.7
source:
url: https://huggingface.co/papers/2603.13032
name: dots.mocr technical report
user: nielsr

View File

@@ -0,0 +1,60 @@
- dataset:
id: llamaindex/ParseBench
task_id: mean
value: 55.8
date: '2026-03-26'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"
- dataset:
id: llamaindex/ParseBench
task_id: text_content
value: 90.0
date: '2026-03-26'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"
- dataset:
id: llamaindex/ParseBench
task_id: text_formatting
value: 47.0
date: '2026-03-26'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"
- dataset:
id: llamaindex/ParseBench
task_id: layout
value: 55.8
date: '2026-04-09'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"
- dataset:
id: llamaindex/ParseBench
task_id: chart
value: 0.9
date: '2026-04-08'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"
- dataset:
id: llamaindex/ParseBench
task_id: table
value: 85.2
date: '2026-04-08'
source:
url: https://huggingface.co/datasets/llamaindex/ParseBench
name: ParseBench
user: boyang-runllama
notes: "Pipeline name: dots_ocr_1_5_parse"

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

289
NOTICE Normal file
View File

@@ -0,0 +1,289 @@
==================================================================
=============== Copyright Notice and License Texts ===============
==================================================================
------------- LICENSE FOR gradio CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
(a) You must give any other recipients of the Work or Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
------------- LICENSE FOR openai CODE --------------
Copyright notice:Copyright 2025 OpenAI
License:apache2.0
Please see above.
------------- LICENSE FOR qwen_vl_utils CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR transformers CODE --------------
Copyright notice:Copyright 2018- The Hugging Face team. All rights reserved.
License:apache2.0
Please see above.
------------- LICENSE FOR huggingface_hub CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR flash-attn CODE --------------
Copyright notice:Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
License:BSD-3-Clause license
BSD 3-Clause License
Copyright (c) 2022, the respective contributors, as shown by the AUTHORS file. All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
------------- LICENSE FOR accelerate CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR OmniDocbench CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR Qwen2.5-VL CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR Hugging Face CODE --------------
Copyright notice:Copyright 2019 Ross Wightman
License:apache2.0
Please see above.
------------- LICENSE FOR vLLM CODE --------------
Copyright notice:No copyright info provided
License:apache2.0
Please see above.
------------- LICENSE FOR Doclaynet --------------
Copyright notice:No copyright info provided
License:Community Data License Agreement
Community Data License Agreement Permissive Version 1.0
This is the Community Data License Agreement Permissive, Version 1.0 (“Agreement”). Data is provided to You under this Agreement by each of the Data Providers. Your exercise of any of the rights and permissions granted below constitutes your acceptance and agreement to be bound by the terms and conditions of this Agreement.
The benefits that each Data Provider receives from making Data available and that You receive from Data or otherwise under these terms and conditions shall be deemed sufficient consideration for the formation of this Agreement. Accordingly, Data Provider(s) and You (the "Parties") agree as follows:
Section 1. Definitions
1.1 "Add" means to supplement Data with Your own or someone else's Data, resulting in Your “Additions.” Additions do not include Results.
1.2 "Computational Use" means Your analysis (through the use of computational devices or otherwise) or other interpretation of Data. By way of example and not limitation, "Computational Use" includes the application of any computational analytical technique, the purpose of which is the analysis of any Data in digital form to generate information about Data such as patterns, trends, correlations, inferences, insights and attributes.
1.3 "Data" means the information (including copyrightable information, such as images or text), collectively or individually, whether created or gathered by a Data Provider or an Entity acting on its behalf, to which rights are granted under this Agreement.
1.4 "Data Provider" means any Entity (including any employee or contractor of such Entity authorized to Publish Data on behalf of such Entity) that Publishes Data under this Agreement prior to Your Receiving it.
1.5 "Enhanced Data" means the subset of Data that You Publish and that is composed of (a) Your Additions and/or (b) Modifications to Data You have received under this Agreement.
1.6 "Entity" means any natural person or organization that exists under the laws of the jurisdiction in which it is organized, together with all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (a) the power, directly or indirectly, to cause the direction or management of such entity, whether by contract or otherwise, (b) the ownership of more than fifty percent (50%) of the outstanding shares or securities, (c) the beneficial ownership of such entity or, (d) the ability to appoint, whether by agreement or right, the majority of directors of an Entity.
1.7 "Modify" means to delete, erase, correct or re-arrange Data, resulting in “Modifications.” Modifications do not include Results.
1.8 "Publish" means to make all or a subset of Data (including Your Enhanced Data) available in any manner which enables its use, including by providing a copy on physical media or remote access. For any form of Entity, that is to make the Data available to any individual who is not employed by that Entity or engaged as a contractor or agent to perform work on that Entity's behalf. A "Publication" occurs each time you Publish Data.
1.9 "Receive" or "Receives" means to have been given access to Data, locally or remotely.
1.10 "Results" means the outcomes or outputs that You obtain from Your Computational Use of Data. Results shall not include more than a de minimis portion of the Data on which the Computational Use is based.
1.11 "Sui Generis Database Rights" means rights, other than copyright, resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other equivalent rights anywhere in the world.
1.12 "Use" means using Data (including accessing, copying, studying, reviewing, adapting, analyzing, evaluating, or making Computational Use of it), either by machines or humans, or a combination of both.
1.13 "You" or "Your" means any Entity that Receives Data under this Agreement.
Section 2. Right and License to Use and to Publish
2.1 Subject to the conditions set forth in Section 3 of this Agreement, Data Provider(s) hereby grant(s) to You a worldwide, non-exclusive, irrevocable (except as provided in Section 5) right to: (a) Use Data; and (b) Publish Data.
2.2 To the extent that the Data or the coordination, selection or arrangement of Data is protected or protectable under copyright, Sui Generis Database Rights, or other law, Data Provider(s) further agree(s) that such Data or coordination, selection or arrangement is hereby licensed to You and to anyone else who Receives Data under this Agreement for Use and Publication, subject to the conditions set forth in Section 3 of this Agreement.
2.3 Except for these rights and licenses expressly granted, no other intellectual property rights are granted or should be implied.
Section 3. Conditions on Rights Granted
3.1 If You Publish Data You Receive or Enhanced Data:
(a) You may do so under a license of your choice provided that you give anyone who receives the data from you the text of this Agreement, the name of this Agreement and/or a hyperlink or other method reasonably likely to provide a copy of the text of this Agreement; and
(b) You must cause any Data files containing Enhanced Data to carry prominent notices that you have changed those files; and
(c) If You Publish Data You Receive, You must preserve all credit or attribution to the Data Provider(s). Such retained credit or attribution includes any of the following to the extent they exist in the Data as You have Received it: legal notices or metadata; identification of the Data Provider(s); or hyperlinks to Data to the extent it is practical to do so.
3.2 You may provide additional or different license terms and conditions for use, reproduction, or distribution of that Enhanced Data, or for any combination of Data and Enhanced Data as a whole, provided that Your Use and Publication of that combined Data otherwise complies with the conditions stated in this License.
3.3 You and each Data Provider agree that Enhanced Data shall not be considered a work of joint authorship by virtue of its relationship to Data licensed under this Agreement and shall not require either any obligation of accounting to or the consent of any Data Provider.
3.4 This Agreement imposes no obligations or restrictions on Your Use or Publication of Results.
Section 4. Data Provider(s)' Representations
4.1 Each Data Provider represents that the Data Provider has exercised reasonable care, to assure that: (a) the Data it Publishes was created or generated by it or was obtained from others with the right to Publish the Data under this Agreement; and (b) Publication of such Data does not violate any privacy or confidentiality obligation undertaken by the Data Provider.
Section 5. Termination
5.1 All of Your rights under this Agreement will terminate, and Your right to Receive, Use or Publish the Data will be revoked or modified if You materially fail to comply with the terms and conditions of this Agreement and You do not cure such failure in a reasonable period of time after becoming aware of such noncompliance. If your rights under this Agreement terminate, you agree to cease Receipt, Use and Publication of Data. However, your obligations and any rights and permissions granted by you under this Agreement relating to Data that you published prior to such termination will continue and survive.
5.2 If you institute litigation against a Data Provider or anyone else who Receives the Data (including a cross-claim in a lawsuit) based on the Data, other than a claim asserting breach of this Agreement, then any rights previously granted to You to Receive, Use and Publish Data under this Agreement will terminate as of the date such litigation is filed.
Section 6. Disclaimer of Warranties and Limitation of Liability
6.1 EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE DATA (INCLUDING ENHANCED DATA) IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
6.2 NEITHER YOU NOR ANY DATA PROVIDERS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE DATA OR THE EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
Section 7. Miscellaneous
7.1 You agree that it is solely your responsibility to comply with all applicable laws with regard to Your Use or Publication of Data, including any applicable privacy, data protection, security and export laws. You agree to take reasonable steps to assist a Data Provider fulfilling responsibilities to comply with applicable laws with regard to Use or Publication of Data Received hereunder.
7.2 You and Data Provider(s), collectively and individually, waive and/or agree not to assert, to the extent permitted by law, any moral rights you or they hold in Data.
7.3 This Agreement confers no rights or remedies upon any person or entity other than the Parties and their respective heirs, executors, successors and assigns.
7.4 The Data Provider(s) reserve no right or expectation of privacy, data protection or confidentiality in any Data that they Publish under this Agreement. If you choose to Publish Data under this Agreement, you similarly do so with no reservation or expectation of any rights of privacy or confidentiality in that Data.
7.5 The Community Data License Agreement workgroup under The Linux Foundation is the steward of this Agreement (“Steward”). No one other than the Steward has the right to modify or publish new versions of this Agreement. Each version will be given a distinguishing version number. You may Use and Publish Data Received hereunder under the terms of the version of the Agreement under which You originally Received the Data, or under the terms of any subsequent version published by the Steward.

788
README.md Normal file
View File

@@ -0,0 +1,788 @@
---
license: mit
library_name: dots_mocr
pipeline_tag: image-text-to-text
tags:
- image-to-text
- ocr
- document-parse
- layout
- table
- formula
- transformers
- custom_code
language:
- en
- zh
- multilingual
---
<div align="center">
<h1 align="center">
dots.mocr
</h1>
[![HuggingFace](https://img.shields.io/badge/HuggingFace%20Weights-black.svg?logo=HuggingFace)](https://huggingface.co/rednote-hilab/dots.mocr)
[![GitHub](https://img.shields.io/badge/GitHub-Repository-black?logo=github)](https://github.com/rednote-hilab/dots.mocr)
[![arXiv](https://img.shields.io/badge/arXiv-Paper-b31b1b.svg?logo=arxiv)](https://arxiv.org/abs/2603.13032v1)
<div align="center">
<a href="https://dotsocr.xiaohongshu.com" target="_blank" rel="noopener noreferrer"><strong>🖥️ Live Demo</strong></a> |
<a href="https://raw.githubusercontent.com/rednote-hilab/dots.ocr/master/assets/wechat.jpg" target="_blank" rel="noopener noreferrer"><strong>💬 WeChat</strong></a> |
<a href="https://www.xiaohongshu.com/user/profile/683ffe42000000001d021a4c" target="_blank" rel="noopener noreferrer"><strong>📕 rednote</strong></a> |
<a href="https://x.com/rednotehilab" target="_blank" rel="noopener noreferrer"><strong>🐦 X</strong></a>
</div>
</div>
## Introduction
We present [dots.mocr](https://huggingface.co/rednote-hilab/dots.mocr). Beyond achieving state-of-the-art (SOTA) performance in standard multilingual document parsing among models of comparable size, **dots.mocr** excels at converting structured graphics (e.g., charts, UI layouts, scientific figures, etc.) directly into SVG code. Its core capabilities encompass grounding, recognition, semantic understanding, and interactive dialogue.
Simultaneously, we are releasing [dots.mocr-svg](https://huggingface.co/rednote-hilab/dots.mocr-svg), a variant specifically optimized for robust image-to-SVG parsing tasks.
More information can be found [in the paper](https://arxiv.org/abs/2603.13032v1).
## Evaluation
### 1. Document Parsing
#### 1.1 Elo scores of the latest models across benchmarks
<table>
<thead>
<tr>
<th>models</th>
<th>olmOCR-Bench</th>
<th>OmniDocBench (v1.5)</th>
<th>XDocParse</th>
<th>Average</th>
</tr>
</thead>
<tbody>
<tr>
<td>MonkeyOCR-pro-3B</td>
<td>895.0</td>
<td>811.3</td>
<td>637.1</td>
<td>781.1</td>
</tr>
<tr>
<td>GLM-OCR</td>
<td>884.2</td>
<td>972.6</td>
<td>820.7</td>
<td>892.5</td>
</tr>
<tr>
<td>PaddleOCR-VL-1.5</td>
<td>897.3</td>
<td>997.9</td>
<td>866.4</td>
<td>920.5</td>
</tr>
<tr>
<td>HunyuanOCR</td>
<td>997.6</td>
<td>1003.9</td>
<td>951.1</td>
<td>984.2</td>
</tr>
<tr>
<td>dots.ocr</td>
<td>1041.1</td>
<td>1027.2</td>
<td>1190.3</td>
<td>1086.2</td>
</tr>
<!-- Highlighting dots.mocr row with bold tags -->
<tr>
<td><strong>dots.mocr</strong></td>
<td><strong>1104.4</strong></td>
<td><strong>1059.0</strong></td>
<td><strong>1210.7</strong></td>
<td><strong>1124.7</strong></td>
</tr>
<tr>
<td>Gemini 3 Pro</td>
<td>1180.4</td>
<td>1128.0</td>
<td>1323.7</td>
<td>1210.7</td>
</tr>
</tbody>
</table>
> **Notes:**
> - Results for Gemini 3 Pro, PaddleOCR-VL-1.5, and GLM-OCR were obtained via APIs, while HunyuanOCR results were generated using local inference.
> - The Elo score evaluation was conducted using Gemini 3 Flash. The prompt can be found at: [Elo Score Prompt](https://github.com/rednote-hilab/dots.mocr/blob/master/tools/elo_score_prompt.py). These results are consistent with the findings on [ocrarena](https://www.ocrarena.ai/battle).
#### 1.2 olmOCR-bench
<table>
<thead>
<tr>
<th>Model</th>
<th>ArXiv</th>
<th>Old scans math</th>
<th>Tables</th>
<th>Old scans</th>
<th>Headers & footers</th>
<th>Multi column</th>
<th>Long tiny text</th>
<th>Base</th>
<th>Overall</th>
</tr>
</thead>
<tbody>
<tr>
<td>Mistral OCR API</td>
<td>77.2</td>
<td>67.5</td>
<td>60.6</td>
<td>29.3</td>
<td>93.6</td>
<td>71.3</td>
<td>77.1</td>
<td>99.4</td>
<td>72.0±1.1</td>
</tr>
<tr>
<td>Marker 1.10.1</td>
<td>83.8</td>
<td>66.8</td>
<td>72.9</td>
<td>33.5</td>
<td>86.6</td>
<td>80.0</td>
<td>85.7</td>
<td>99.3</td>
<td>76.1±1.1</td>
</tr>
<tr>
<td>MinerU 2.5.4*</td>
<td>76.6</td>
<td>54.6</td>
<td>84.9</td>
<td>33.7</td>
<td>96.6</td>
<td>78.2</td>
<td>83.5</td>
<td>93.7</td>
<td>75.2±1.1</td>
</tr>
<tr>
<td>DeepSeek-OCR</td>
<td>77.2</td>
<td>73.6</td>
<td>80.2</td>
<td>33.3</td>
<td>96.1</td>
<td>66.4</td>
<td>79.4</td>
<td>99.8</td>
<td>75.7±1.0</td>
</tr>
<tr>
<td>Nanonets-OCR2-3B</td>
<td>75.4</td>
<td>46.1</td>
<td>86.8</td>
<td>40.9</td>
<td>32.1</td>
<td>81.9</td>
<td>93.0</td>
<td>99.6</td>
<td>69.5±1.1</td>
</tr>
<tr>
<td>PaddleOCR-VL*</td>
<td>85.7</td>
<td>71.0</td>
<td>84.1</td>
<td>37.8</td>
<td>97.0</td>
<td>79.9</td>
<td>85.7</td>
<td>98.5</td>
<td>80.0±1.0</td>
</tr>
<tr>
<td>Infinity-Parser 7B*</td>
<td>84.4</td>
<td>83.8</td>
<td>85.0</td>
<td>47.9</td>
<td>88.7</td>
<td>84.2</td>
<td>86.4</td>
<td>99.8</td>
<td>82.5±?</td>
</tr>
<tr>
<td>olmOCR v0.4.0</td>
<td>83.0</td>
<td>82.3</td>
<td>84.9</td>
<td>47.7</td>
<td>96.1</td>
<td>83.7</td>
<td>81.9</td>
<td>99.7</td>
<td>82.4±1.1</td>
</tr>
<tr>
<td>Chandra OCR 0.1.0*</td>
<td>82.2</td>
<td>80.3</td>
<td>88.0</td>
<td>50.4</td>
<td>90.8</td>
<td>81.2</td>
<td>92.3</td>
<td>99.9</td>
<td>83.1±0.9</td>
</tr>
<tr>
<td>dots.ocr</td>
<td>82.1</td>
<td>64.2</td>
<td>88.3</td>
<td>40.9</td>
<td>94.1</td>
<td>82.4</td>
<td>81.2</td>
<td>99.5</td>
<td>79.1±1.0</td>
</tr>
<tr>
<td><strong>dots.mocr</strong></td>
<td><strong>85.9</strong></td>
<td><strong>85.5</strong></td>
<td><strong>90.7</strong></td>
<td>48.2</td>
<td>94.0</td>
<td><strong>85.3</strong></td>
<td>81.6</td>
<td>99.7</td>
<td><strong>83.9±0.9</strong></td>
</tr>
</tbody>
</table>
> **Note:**
> - The metrics are from [olmocr](https://github.com/allenai/olmocr), and our own internal evaluations.
> - We delete the Page-header and Page-footer cells in the result markdown.
#### 1.3 Other Benchmarks
<table>
<thead>
<tr>
<th>Model Type</th>
<th>Methods</th>
<th>Size</th>
<th>OmniDocBench(v1.5)<br>TextEdit↓</th>
<th>OmniDocBench(v1.5)<br>Read OrderEdit↓</th>
<th>pdf-parse-bench</th>
</tr>
</thead>
<tbody>
<!-- GeneralVLMs Group (Reversed Order, 3 rows) -->
<tr>
<td rowspan="3"><strong>GeneralVLMs</strong></td>
<td>Gemini-2.5 Pro</td>
<td>-</td>
<td>0.075</td>
<td>0.097</td>
<td>9.06</td>
</tr>
<tr>
<td>Qwen3-VL-235B-A22B-Instruct</td>
<td>235B</td>
<td>0.069</td>
<td>0.068</td>
<td><strong>9.71</strong></td>
</tr>
<tr>
<td>gemini3pro</td>
<td>-</td>
<td>0.066</td>
<td>0.079</td>
<td>9.68</td>
</tr>
<!-- SpecializedVLMs Group (Reversed Order, 12 rows) -->
<tr>
<td rowspan="12"><strong>SpecializedVLMs</strong></td>
<td>Mistral OCR</td>
<td>-</td>
<td>0.164</td>
<td>0.144</td>
<td>8.84</td>
</tr>
<tr>
<td>Deepseek-OCR</td>
<td>3B</td>
<td>0.073</td>
<td>0.086</td>
<td>8.26</td>
</tr>
<tr>
<td>MonkeyOCR-3B</td>
<td>3B</td>
<td>0.075</td>
<td>0.129</td>
<td>9.27</td>
</tr>
<tr>
<td>OCRVerse</td>
<td>4B</td>
<td>0.058</td>
<td>0.071</td>
<td>-</td>
</tr>
<tr>
<td>MonkeyOCR-pro-3B</td>
<td>3B</td>
<td>0.075</td>
<td>0.128</td>
<td>-</td>
</tr>
<tr>
<td>MinerU2.5</td>
<td>1.2B</td>
<td>0.047</td>
<td>0.044</td>
<td>-</td>
</tr>
<tr>
<td>PaddleOCR-VL</td>
<td>0.9B</td>
<td>0.035</td>
<td>0.043</td>
<td>9.51</td>
</tr>
<tr>
<td>HunyuanOCR</td>
<td>0.9B</td>
<td>0.042</td>
<td>-</td>
<td>-</td>
</tr>
<tr>
<td>PaddleOCR-VL1.5</td>
<td>0.9B</td>
<td>0.035</td>
<td>0.042</td>
<td>-</td>
</tr>
<tr>
<td>GLMOCR</td>
<td>0.9B</td>
<td>0.04</td>
<td>0.043</td>
<td>-</td>
</tr>
<tr>
<td>dots.ocr</td>
<td>3B</td>
<td>0.048</td>
<td>0.053</td>
<td>9.29</td>
</tr>
<tr>
<td><u><strong>dots.mocr</strong></u></td>
<td>3B</td>
<td><strong>0.031</strong></td>
<td><strong>0.029</strong></td>
<td>9.54</td>
</tr>
</tbody>
</table>
> **Note:**
> - Metrics are sourced from [OmniDocBench](https://github.com/opendatalab/OmniDocBench) and other model publications. [pdf-parse-bench](https://github.com/phorn1/pdf-parse-bench) results are reproduced by Qwen3-VL-235B-A22B-Instruct.
> - Formula and Table metrics for OmniDocBench1.5 are omitted due to their high sensitivity to detection and matching protocols.
### 2. Structured Graphics Parsing
Visual languages (e.g., charts, graphics, chemical formulas, logos) encapsulate dense human knowledge. **dots.mocr** unifies the interpretation of these elements by parsing them directly into **SVG code**.
<table>
<thead>
<tr>
<th rowspan="2" style="text-align: left;">Methods</th>
<th colspan="3">Unisvg</th>
<th rowspan="2">Chartmimic</th>
<th rowspan="2">Design2Code</th>
<th rowspan="2">Genexam</th>
<th rowspan="2">SciGen</th>
<th rowspan="2">ChemDraw</th>
</tr>
<tr>
<th>Low-Level</th>
<th>High-Level</th>
<th>Score</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: left;">OCRVerse</td>
<td>0.632</td>
<td>0.852</td>
<td>0.763</td>
<td>0.799</td>
<td>-</td>
<td>-</td>
<td>-</td>
<td>0.881</td>
</tr>
<tr>
<td style="text-align: left;">Gemini 3 Pro</td>
<td>0.563</td>
<td>0.850</td>
<td>0.735</td>
<td>0.788</td>
<td>0.760</td>
<td>0.756</td>
<td>0.783</td>
<td>0.839</td>
</tr>
<tr>
<td style="text-align: left;">dots.mocr</td>
<td>0.850</td>
<td>0.923</td>
<td>0.894</td>
<td>0.772</td>
<td>0.801</td>
<td>0.664</td>
<td>0.660</td>
<td>0.790</td>
</tr>
<tr>
<td style="text-align: left;"><strong>dots.mocr-svg</strong></td>
<td><strong>0.860</strong></td>
<td><strong>0.931</strong></td>
<td><strong>0.902</strong></td>
<td><strong>0.905</strong></td>
<td><strong>0.834</strong></td>
<td><strong>0.8</strong></td>
<td><strong>0.797</strong></td>
<td><strong>0.901</strong></td>
</tr>
</tbody>
</table>
> **Note:**
> - We use the ISVGEN metric from [UniSVG](https://ryanlijinke.github.io/) to evaluate the parsing result. For benchmarks that do not natively support image parsing, we use the original images as input, and calculate the ISVGEN score between the rendered output and the original image.
> - [OCRVerse](https://github.com/DocTron-hub/OCRVerse) results are derived from various code formats (e.g., SVG, Python), whereas results for Gemini 3 Pro and dots.mocr are based specifically on SVG code.
> - Due to the capacity constraints of a 3B-parameter VLM, dots.mocr may not yet excel at every task, such as SVG generation. To complement this, we are simultaneously releasing dots.mocr-svg. We plan to further address these limitations in future updates.
### 3. General Vision Tasks
<table>
<thead>
<tr>
<th>Model</th>
<th>CharXiv_descriptive</th>
<th>CharXiv_reasoning</th>
<th>OCR_Reasoning</th>
<th>infovqa</th>
<th>docvqa</th>
<th>ChartQA</th>
<th>OCRBench</th>
<th>AI2D</th>
<th>CountBenchQA</th>
<th>refcoco</th>
</tr>
</thead>
<tbody>
<tr>
<td>Qwen3vl-2b-instruct</td>
<td>62.3</td>
<td>26.8</td>
<td>-</td>
<td>72.4</td>
<td>93.3</td>
<td>-</td>
<td>85.8</td>
<td>76.9</td>
<td>88.4</td>
<td>-</td>
</tr>
<tr>
<td>Qwen3vl-4b-instruct</td>
<td>76.2</td>
<td>39.7</td>
<td>-</td>
<td>80.3</td>
<td>95.3</td>
<td>-</td>
<td>88.1</td>
<td>84.1</td>
<td>84.9</td>
<td>-</td>
</tr>
<tr>
<td><strong>dots.mocr</strong></td>
<td>77.4</td>
<td>55.3</td>
<td>22.85</td>
<td>73.76</td>
<td>91.85</td>
<td>83.2</td>
<td>86.0</td>
<td>82.16</td>
<td>94.46</td>
<td>80.03</td>
</tr>
</tbody>
</table>
# Quick Start
## 1. Installation
### Install dots.mocr
```shell
conda create -n dots_mocr python=3.12
conda activate dots_mocr
git clone https://github.com/rednote-hilab/dots.mocr.git
cd dots.mocr
# Install pytorch, see https://pytorch.org/get-started/previous-versions/ for your cuda version
# pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu128
# install flash-attn==2.8.0.post2 for faster inference
pip install -e .
```
If you have trouble with the installation, try our [Docker Image](https://hub.docker.com/r/rednotehilab/dots.ocr) for an easier setup, and follow these steps:
### Download Model Weights
> 💡**Note:** Please use a directory name without periods (e.g., `DotsMOCR` instead of `dots.mocr`) for the model save path. This is a temporary workaround pending our integration with Transformers.
```shell
python3 tools/download_model.py
# with modelscope
python3 tools/download_model.py --type modelscope
```
## 2. Deployment
### vLLM inference
We highly recommend using vLLM for deployment and inference. **Since vLLM version 0.11.0, Dots OCR has been officially integrated into vLLM with verified performance** and you can use the vLLM docker image directly (e.g., `vllm/vllm-openai:v0.11.0`) to deploy the model server.
```shell
# Launch vLLM model server
## dots.mocr
CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.mocr --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
## dots.mocr-svg
CUDA_VISIBLE_DEVICES=0 vllm serve rednote-hilab/dots.mocr-svg --tensor-parallel-size 1 --gpu-memory-utilization 0.9 --chat-template-content-format string --served-model-name model --trust-remote-code
# vLLM API Demo
# See dots_mocr/model/inference.py and dots_mocr/utils/prompts.py for details on parameter and prompt settings
# that help achieve the best output quality.
## document parsing
python3 ./demo/demo_vllm.py --prompt_mode prompt_layout_all_en
## web parsing
python3 ./demo/demo_vllm.py --prompt_mode prompt_web_parsing --image_path ./assets/showcase/origin/webpage_1.png
## scene spotting
python3 ./demo/demo_vllm.py --prompt_mode prompt_scene_spotting --image_path ./assets/showcase/origin/scene_1.jpg
## image parsing with svg code
python3 ./demo/demo_vllm_svg.py --prompt_mode prompt_image_to_svg
## general qa
python3 ./demo/demo_vllm_general.py
```
### Hugging Face inference
```shell
python3 demo/demo_hf.py
```
<details>
<summary><b>Hugging Face inference details</b></summary>
```python
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
from qwen_vl_utils import process_vision_info
from dots_mocr.utils import dict_promptmode_to_prompt
model_path = "./weights/DotsMOCR"
model = AutoModelForCausalLM.from_pretrained(
model_path,
attn_implementation="flash_attention_2",
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
image_path = "demo/demo_image1.jpg"
prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
- Picture: For the 'Picture' category, the text field should be omitted.
- Formula: Format its text as LaTeX.
- Table: Format its text as HTML.
- All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
- The output text must be the original text from the image, with no translation.
- All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image_path
},
{"type": "text", "text": prompt}
]
}
]
# Preparation for inference
text = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=24000)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```
</details>
### Hugging Face inference with CPU
Please refer to [CPU inference](https://github.com/rednote-hilab/dots.ocr/issues/1#issuecomment-3148962536)
## 3. Document Parse
**Based on vLLM server**, you can parse an image or a pdf file using the following commands:
```bash
# Parse all layout info, both detection and recognition
# Parse a single image
python3 dots_mocr/parser.py demo/demo_image1.jpg
# Parse a single PDF
python3 dots_mocr/parser.py demo/demo_pdf1.pdf --num_thread 64 # try bigger num_threads for pdf with a large number of pages
# Layout detection only
python3 dots_mocr/parser.py demo/demo_image1.jpg --prompt prompt_layout_only_en
# Parse text only, except Page-header and Page-footer
python3 dots_mocr/parser.py demo/demo_image1.jpg --prompt prompt_ocr
```
**Based on Transformers**, you can parse an image or a pdf file using the same commands above, just add `--use_hf true`.
> Note: Transformers is slower than vLLM. If you want to use demo/* with Transformers, just add `use_hf=True` in `DotsMOCRParser(..,use_hf=True)`.
<details>
<summary><b>Output Results</b></summary>
1. **Structured Layout Data** (`demo_image1.json`): A JSON file containing the detected layout elements, including their bounding boxes, categories, and extracted text.
2. **Processed Markdown File** (`demo_image1.md`): A Markdown file generated from the concatenated text of all detected cells.
* An additional version, `demo_image1_nohf.md`, is also provided, which excludes page headers and footers for compatibility with benchmarks like Omnidocbench and olmOCR-bench.
3. **Layout Visualization** (`demo_image1.jpg`): The original image with the detected layout bounding boxes drawn on it.
</details>
## 4. Demo
Have fun with the [live demo](https://dotsocr.xiaohongshu.com/).
### Examples for document parsing
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/formula1.png" alt="formula1.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/table3.png" alt="table3.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/Tibetan.png" alt="Tibetan.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/tradition_zh.png" alt="tradition_zh.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/nl.png" alt="nl.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/kannada.png" alt="kannada.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/russian.png" alt="russian.png" border="0" />
### Examples for image parsing
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_1.png" alt="svg_1.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_2.png" alt="svg_2.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_4.png" alt="svg_4.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_5.png" alt="svg_5.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/svg_6.png" alt="svg_6.png" border="0" />
> **Note:**
> - These results were produced by dots.mocr-svg.
### Example for web parsing
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/webpage_1.png" alt="webpage_1.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/webpage_2.png" alt="webpage_2.png" border="0" />
### Examples for scene spotting
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/scene_1.png" alt="scene_1.png" border="0" />
<img src="https://raw.githubusercontent.com/rednote-hilab/dots.mocr/master/assets/showcase/result/scene_2.png" alt="scene_2.png" border="0" />
# Limitation & Future Work
- **Complex Document Elements:**
- **Table&Formula**: The extraction of complex tables and mathematical formulas persists as a difficult task given the model's compact architecture.
- **Picture**: We have adopted an SVG code representation for parsing structured graphics; however, the performance has yet to achieve the desired level of robustness.
- **Parsing Failures:** While we have reduced the rate of parsing failures compared to the previous version, these issues may still occur occasionally. We remain committed to further resolving these edge cases in future updates.
# Citation
```BibTeX
@misc{zheng2026multimodalocrparsedocuments,
title={Multimodal OCR: Parse Anything from Documents},
author={Handong Zheng and Yumeng Li and Kaile Zhang and Liang Xin and Guangwei Zhao and Hao Liu and Jiayu Chen and Jie Lou and Jiyu Qiu and Qi Fu and Rui Yang and Shuo Jiang and Weijian Luo and Weijie Su and Weijun Zhang and Xingyu Zhu and Yabin Li and Yiwei ma and Yu Chen and Zhaohui Yu and Guang Yang and Colin Zhang and Lei Zhang and Yuliang Liu and Xiang Bai},
year={2026},
eprint={2603.13032},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2603.13032},
}
```

3
chat_template.json Normal file
View File

@@ -0,0 +1,3 @@
{
"chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{%- for m in messages %}{%- if m.role == 'system' %}{{- '<|system|>' + m.content + '<|endofsystem|>\n' }}{%- elif m.role == 'user' %}{% if m.content is string %}{{- '<|user|>' + m.content + '<|endofuser|>' }}{% else %}{{- '<|user|>' }}{% for content in m.content %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|img|><|imgpad|><|endofimg|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|img|><|video_pad|><|endofimg|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}{{- '<|endofuser|>' }}{%- endif %}{%- elif m.role == 'assistant' %}{{- '<|assistant|>' + m.content }}{%- if not loop.last %}{{- '<|endofassistant|>' }}{%- endif %}{%- endif %}{%- endfor %}{%- if messages[-1].role != 'assistant' %}{{- '<|assistant|>' }}{%- endif %}"
}

51
config.json Normal file
View File

@@ -0,0 +1,51 @@
{
"architectures": [
"DotsOCRForCausalLM"
],
"model_type": "dots_ocr",
"auto_map": {
"AutoConfig": "configuration_dots.DotsOCRConfig",
"AutoModelForCausalLM": "modeling_dots_ocr.DotsOCRForCausalLM"
},
"attention_bias": true,
"attention_dropout": 0.0,
"hidden_act": "silu",
"hidden_size": 1536,
"initializer_range": 0.02,
"intermediate_size": 8960,
"max_position_embeddings": 131072,
"max_window_layers": 28,
"num_attention_heads": 12,
"num_hidden_layers": 28,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 1000000,
"sliding_window": 131072,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.51.0",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936,
"image_token_id": 151665,
"video_token_id": 151656,
"vision_config": {
"embed_dim": 1536,
"hidden_size": 1536,
"intermediate_size": 4224,
"num_hidden_layers": 42,
"num_attention_heads": 12,
"num_channels": 3,
"patch_size": 14,
"post_norm": true,
"rms_norm_eps": 1e-05,
"spatial_merge_size": 2,
"temporal_patch_size": 1,
"use_bias": false,
"attn_implementation": "flash_attention_2",
"init_merger_std": 0.02,
"initializer_range": 0.02,
"is_causal": false
}
}

78
configuration_dots.py Normal file
View File

@@ -0,0 +1,78 @@
from typing import Any, Optional
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2 import Qwen2Config
from transformers import Qwen2_5_VLProcessor, AutoProcessor
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
class DotsVisionConfig(PretrainedConfig):
    """Settings for the dots vision tower (``model_type = "dots_vit"``).

    Each named constructor argument is stored, unchanged, on the instance under
    the same attribute name. Any extra keyword arguments are passed straight to
    ``PretrainedConfig.__init__``.

    Args:
        embed_dim: Embedding size of the vision encoder.
        hidden_size: Hidden size after the merger.
        intermediate_size: Stored as ``self.intermediate_size``.
        num_hidden_layers: Stored as ``self.num_hidden_layers``.
        num_attention_heads: Stored as ``self.num_attention_heads``.
        num_channels: Stored as ``self.num_channels``.
        patch_size: Stored as ``self.patch_size``.
        spatial_merge_size: Stored as ``self.spatial_merge_size``.
        temporal_patch_size: Stored as ``self.temporal_patch_size``.
        rms_norm_eps: Stored as ``self.rms_norm_eps``.
        use_bias: Stored as ``self.use_bias``.
        attn_implementation: Attention backend name; the values listed by the
            original author are ``"eager"``, ``"sdpa"`` and
            ``"flash_attention_2"``.
        initializer_range: Stored as ``self.initializer_range``.
        init_merger_std: Stored as ``self.init_merger_std``.
        is_causal: Whether the vision encoder runs a causal forward pass
            (NOTE(review): inferred from the original "ve causal forward"
            remark; confirm against the modeling code).
        post_norm: Stored as ``self.post_norm``.
        gradient_checkpointing: Stored as ``self.gradient_checkpointing``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type: str = "dots_vit"

    def __init__(
        self,
        embed_dim: int = 1536,
        hidden_size: int = 1536,
        intermediate_size: int = 4224,
        num_hidden_layers: int = 42,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 1,
        rms_norm_eps: float = 1e-5,
        use_bias: bool = False,
        attn_implementation="flash_attention_2",
        initializer_range=0.02,
        init_merger_std=0.02,
        is_causal=False,
        post_norm=True,
        gradient_checkpointing=False,
        **kwargs: Any,
    ):
        # Let the base class consume the generic config keyword arguments first.
        super().__init__(**kwargs)

        # Transformer dimensions.
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input patching.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size

        # Layer details.
        self.rms_norm_eps = rms_norm_eps
        self.use_bias = use_bias
        self.attn_implementation = attn_implementation

        # Weight initialisation.
        self.initializer_range = initializer_range
        self.init_merger_std = init_merger_std

        # Forward-pass / training switches.
        self.is_causal = is_causal
        self.post_norm = post_norm
        self.gradient_checkpointing = gradient_checkpointing
class DotsOCRConfig(Qwen2Config):
    """Top-level dots.ocr configuration: a ``Qwen2Config`` plus vision settings.

    Args:
        image_token_id: Token id stored as ``self.image_token_id``.
        video_token_id: Token id stored as ``self.video_token_id``.
        vision_config: Vision tower settings. May be ``None`` (all
            ``DotsVisionConfig`` defaults), a ``dict`` of keyword arguments for
            ``DotsVisionConfig``, or an already constructed
            ``DotsVisionConfig`` instance, which is used as-is.
        *args, **kwargs: Forwarded to ``Qwen2Config.__init__``.
    """

    model_type = "dots_ocr"

    def __init__(self,
                 image_token_id = 151665,
                 video_token_id = 151656,
                 vision_config: Optional[dict] = None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        if isinstance(vision_config, DotsVisionConfig):
            # A ready-made config object cannot be unpacked with ``**``
            # (it is not a mapping), so keep it instead of raising TypeError.
            self.vision_config = vision_config
        else:
            # ``None`` and ``{}`` both fall back to the DotsVisionConfig defaults.
            self.vision_config = DotsVisionConfig(**(vision_config or {}))

    def save_pretrained(self, save_directory, **kwargs):
        """Save the configuration to ``save_directory``.

        ``_auto_class`` is reset to ``None`` before delegating to the parent
        implementation.
        NOTE(review): presumably this stops the parent from exporting the
        custom-code module next to the saved config; confirm.
        """
        self._auto_class = None
        super().save_pretrained(save_directory, **kwargs)
class DotsVLProcessor(Qwen2_5_VLProcessor):
    """``Qwen2_5_VLProcessor`` variant using the dots image/video placeholders.

    The placeholder strings come from the tokenizer when it exposes
    ``image_token`` / ``video_token``; otherwise the literals ``"<|imgpad|>"``
    and ``"<|video_pad|>"`` are used. The token ids are fixed constants.
    Extra ``**kwargs`` are accepted but are not passed on to the parent
    constructor.
    """

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

        # Image placeholder: tokenizer-provided value wins over the built-in literal.
        self.image_token = getattr(tokenizer, "image_token", "<|imgpad|>")
        self.image_token_id = 151665

        # Video placeholder: same fallback rule as for images.
        self.video_token = getattr(tokenizer, "video_token", "<|video_pad|>")
        self.video_token_id = 151656
# Import-time side effect: register the custom processor and config classes
# under the "dots_ocr" key with transformers' auto-class registries.
# NOTE(review): AutoProcessor.register is documented as taking a config class
# as its first argument, while a string key is passed here; confirm that the
# processor lookup actually resolves through this entry.
AutoProcessor.register("dots_ocr", DotsVLProcessor)
CONFIG_MAPPING.register("dots_ocr", DotsOCRConfig)

109
dots.mocr LICENSE AGREEMENT Normal file
View File

@@ -0,0 +1,109 @@
dots.mocr LICENSE AGREEMENT
Effective Date: [ August 8, 2025]
Copyright Holder: [Xingyin Information Technology (Shanghai) Co., Ltd]
This License Agreement (“Agreement”) governs Your use, reproduction, modification, and distribution of dots.mocr (the "Model Materials"). This Agreement is designed to maximize the openness and use of the Model Materials while addressing the unique legal, ethical, and technical challenges posed by large language models.
WHEREAS, Licensor has developed the dots.mocr document parsing model and intends to distribute the Model Materials under an open-source framework;
WHEREAS, traditional open-source licenses (e.g., the MIT License) may not fully address the inherent complexities of document parsing models, namely their multiple components (code, weights, training data), potential ethical risks, data-governance issues, and intellectual-property and liability questions regarding AI-generated content;
WHEREAS, Licensor seeks to provide a legal framework that ensures maximum access to and use of the Model Materials while clearly defining the rights, obligations, and liabilities of Licensee;
THEREFORE, the parties agree that, subject to the MIT License, they shall be bound by the following terms and conditions:
1.Definitions and Interpretation
Purpose: To define key terms used in this Agreement, particularly "Model Materials," ensuring clarity of the license scope beyond traditional software code. To clarify the order of precedence between this Agreement and the MIT License to avoid conflict.
1.1“Licensor” shall mean the entity providing the Model Materials under this Agreement, namely [Xingyin Information Technology (Shanghai) Co., Ltd].
1.2“Licensee” or "You"shall mean any individual or entity exercising permissions granted by this Agreement.
1.3“Model Materials”shall mean all materials provided by Licensor under this Agreement, including but not limited to:
(a)one or more machine-learning models, including architecture and trained parameters (i.e., model weights);
(b)all associated preprocessing, training, inference, and fine-tuning code;
(c)training datasets and evaluation scripts (or their detailed descriptions and access mechanisms); and
(d)any accompanying documentation, metadata, and tools.
The above Model Materials shall be subject to the content published on the Licensor's website or GitHub repository at https://github.com/rednote-hilab/dots.mocr.
1.4“Outputs” shall mean any content generated through the use of the Model Materials, such as text, tables, code, layout information, and formulas extracted from documents.
1.5“MIT License”shall mean The MIT Open Source License published by the Massachusetts Institute of Technology.
1.6Priority of Agreement. In the event of any conflict or inconsistency between this Agreement and the MIT License, the terms of the MIT License shall prevail. However, if the terms of the MIT License are ambiguous or silent on a particular matter, the provisions of this Agreement shall apply and supplement the MIT License.
2.Grant of Rights and Scope of Use
Purpose: To grant broad, permissive rights to the Licensee for the Model Materials—including code, weights, data, and documentation—to ensure maximum openness and flexibility while clarifying the free use of model-generated content. Additionally, it clarifies the feasibility of transitioning from open-source to commercial use and the use of OpenAPI interfaces.
2.1 Grant of Copyright License. Subject to Licensee's compliance with this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free copyright license to use (run or test), reproduce, modify, create derivative works of, merge, publish, distribute the Model Materials; sublicense and/or sell copies of the Model Materials or any derivative works thereof; and incorporate the unmodified or modified Model Materials into proprietary products or services, including for commercial purposes, software-as-a-service (SaaS) offerings, or via OpenAPI or other interfaces.
2.2Fundamental Capabilities. The Model Materials only provide the fundamental model's capabilities. Licensees may develop derivative AI applications or undertake task-specific training thereon.
2.3From Open Source to Commercial Use. The open-source release does not preclude Licensor's commercial exploitation of the Model Materials, in whole or in part. Any such commercial use shall, at that time, be subject to license agreements between Licensor and applicable users.
2.4API-Service Exception. Licensees who access the Model Materials through API calls or provide model services via API interfaces (without directly distributing model weights) shall not be subject to this Agreement unless otherwise expressly agreed. Instead, such use shall be governed by the API terms of use published by Licensor (if any).
3.Acceptable Use Policy and Prohibited Uses
3.1Responsible Use. Licensee must use the Model Materials in a responsible, ethical, and lawful manner, in compliance with all applicable laws, regulations, industry standards, and best practices.
3.2Enterprise On-Premises Deployment. The Licensee may deploy the Model Materials in closed-source, on-premises enterprise environments.
3.3Prohibited Uses. Any breach of the prohibitions below will result in the automatic termination of all licenses granted under this Agreement. Licensee agrees not to use the Model Materials or any derivative works thereof, in connection with:
(a) Identification and Utilization of Illegal/Harmful Content:Includes identifying graphic/text materials used for counterfeiting certificates/invoices, perpetrating fraud, or launching cyberattacks; or processing images containing illegal content such as violence, criminal activities, disinformation, or child exploitation.
(b) Privacy Infringement and Discriminatory Practices:Extracting personal sensitive information (e.g., ID numbers, medical records, biometric data) or protected characteristics (e.g., race, gender) from images without legal authorization or consent, for purposes of privacy violation, automated discriminatory decision-making, or harassment.
(c) Copyright Restrictions:Licensees shall not use the tool for unauthorized digitization of publications/document scanning or bulk scraping of content. Any use involving publications or other copyright-protected materials must first obtain relevant permissions.
4.Intellectual Property Ownership and Contributions
4.1Licensor's Copyright Reservation. Licensor reserves all right, title, and interest in and to the Model Materials (including the model architecture, parameters, code, and original training data), except as expressly licensed herein. The original copyright of the Model Materials belongs to the Licensor.
4.2Patent License. Subject to the terms and conditions of this Agreement, Licensor hereby grants Licensee a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model Materials, where such license applies only to those patent claims licensable by the Licensor that are necessarily infringed by its contribution(s).
If Licensee institutes patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model Materials constitute direct or contributory patent infringement, then any patent licenses granted under this License for the Model Materials shall terminate as of the date such litigation is asserted or filed.
4.3Outputs: The Outputs generated through the use of the Model Materials generally refer to text, tables, layouts, and other content extracted from documents or images. The extracted content itself does not generate new intellectual property rights, and all intellectual property remains with the original authors or copyright holders. The Licensee is responsible for due diligence regarding the legality of the Outputs, particularly where the content extracted by the OCR model may be substantially similar to existing copyrighted works, which could present intellectual property infringement risks. The Licensor assumes no liability for such infringements.
4.4Trademarks. Nothing in this License permits Licensee to make use of Licensor's trademarks, trade names, logos (e.g., “rednote,” “Xiaohongshu,” “dots.mocr”) or to otherwise suggest endorsement or misrepresent the relationship between the parties, unless Licensor's prior written approval is granted.
5.Data Governance, Privacy, and Security
5.1Data Quality and Bias. Licensee shall use training data from lawful sources and is encouraged to conduct due diligence before deploying the Model Materials and to take reasonable steps to mitigate any known biases in its training data or applications.
5.2Privacy Protection.
(a)Sensitive-Data Restrictions. It is prohibited to use the Model Materials to process, extract, or infer sensitive personal data protected under specific laws (such as GDPR or HIPAA), particularly when dealing with documents containing personally identifiable information (such as ID numbers, health data, financial information, etc.), unless Licensee has obtained all necessary consents, lawful basis, or authorizations, and has implemented adequate anonymization, pseudonymization, or other privacy-enhancing technologies.
(b)Data Minimization and Purpose Limitation. The Licensee shall follow the principle of data minimization when using the OCR Model, processing only the user data necessary for specific, explicit, and lawful purposes. Specifically, the OCR Model should avoid processing unnecessary sensitive data and ensure compliance with applicable privacy protection laws during data handling.
(c)Transparency. Licensee shall provide clear and transparent privacy policies and terms of use when processing user data, particularly during document scanning and information extraction.
5.3Security Measures. Licensee shall implement appropriate technical and administrative safeguards to protect the Model Materials and any associated data against unauthorized access, disclosure, alteration, or destruction. Such measures may include, but are not limited to, encryption, access controls, logging, and audit trails.
5.4Further Training. Licensee may only use user-provided input or Outputs for training, fine-tuning, or improving other AI models if it has obtained the specific and informed consent of data subjects.
6.Disclaimer of Warranty and Limitation of Liability
6.1“AS-IS” Basis. Unless required by applicable law, the Model Materials are provided on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. Licensee is solely responsible for determining the appropriateness of using or redistributing the Model Materials and assumes any risks associated with the exercise of permissions under this License. Licensor does not provide any warranty of non-infringement but represents that no infringing code has been knowingly included.
6.2Outputs Disclaimer. As a neutral technology, Licensor disclaims all liability for the accuracy, completeness, reliability, safety, legality, or suitability of any Outputs. The Licensee is solely responsible for verifying the accuracy and appropriateness of AI-generated content and shall provide appropriate disclosures when publishing or relying upon such content.
6.3Limitation of Liability and Recourse. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall Licensor or contributors be liable for any claims, damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model Materials (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Licensor has been advised of the possibility of such damages. If such losses are incurred, recourse may be sought against the Licensee responsible for causing the loss.
6.4Content-Filtering Disclaimer. Although the Model Materials may include content-filtering mechanisms, Licensor makes no warranties of any kind regarding the stability, quality, accuracy, completeness, or any specific outcome of Outputs. Licensee is solely responsible for reviewing, verifying, and performing quality control on Outputs and assumes all associated risks and liabilities.
7.Attribution and License Reservation
7.1License. When distributing or redistributing the Model Materials, Licensee must give any other recipients of the Model Materials a copy of this Agreement.
7.2Copyright and Notices. When distributing any part of the Model Materials, Licensee must retain all copyright, patent, trademark, and attribution notices included in the Model Materials.
7.3Attribution. Licensee is encouraged to prominently display the name of Licensor and the Model Materials in any public statements, products, or services that contain the Model Materials (or any derivative works thereof), to promote transparency and community trust. If Licensee distributes modified weights or fine-tuned models based on the Model Materials, Licensee must prominently display the following statement in the related website or documentation: “Built with dots.mocr.”
8.Governing Law and Dispute Resolution
8.1Governing Law. This Agreement shall be governed by and construed in accordance with the laws of the People's Republic of China, without regard to its conflict of laws principles.
8.2Dispute Resolution. Any dispute, claim, or disagreement arising out of or relating to this Agreement shall first be resolved through amicable consultation. If such consultation fails, the dispute shall be submitted to the Hangzhou Arbitration Commission for arbitration. The arbitration shall be conducted in accordance with the laws of China, and the place of arbitration shall be [Hangzhou, China]. The arbitral award shall be final and binding upon both parties.
9.Regulatory Compliance Amendments
In the event that any part of this Agreement becomes invalid or requires adjustment due to changes in applicable laws or regulations, Licensor reserves the right to issue a revised version of this Agreement. Licensee shall migrate to the new version within [e.g., ninety (90)] days of its release; otherwise, all rights granted under this Agreement shall automatically terminate.
10.Security Reporting
Licensee discovering any security vulnerability in the Model Materials may report it to Licensor via: dots-feedback@xiaohongshu.com. Licensee shall not disclose vulnerability details until Licensor issues an official remediation, unless otherwise required by law.

8
generation_config.json Normal file
View File

@@ -0,0 +1,8 @@
{
"max_length": 32768,
"eos_token_id": [
151643,
151672,
151673
]
}

151387
merges.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d8f4bc75340279da003609fe93f2eb02cc1a77087f5dfb6ba46c0980e1b4da81
size 4998547840

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0101b36fe6620ba135d4bb8efbbf275fc27b4363b10b7f632c058c0955f3dc4d
size 1079883896

View File

@@ -0,0 +1,651 @@
{
"metadata": {
"total_parameters": 3039179264,
"total_size": 6078358528
},
"weight_map": {
"lm_head.weight": "model-00002-of-00002.safetensors",
"model.embed_tokens.weight": "model-00001-of-00002.safetensors",
"model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.27.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.27.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.6.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.input_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
"model.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"model.norm.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.1.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.11.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.12.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.13.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.13.mlp.fc3.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.15.attn.proj.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.15.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.15.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.2.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.20.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.21.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.21.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.22.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.24.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.25.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.26.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.26.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.26.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.26.mlp.fc3.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.26.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.26.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.27.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.28.attn.proj.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.28.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.28.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.28.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.28.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.28.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.28.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.29.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.29.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.29.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.29.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.29.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.29.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.29.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.3.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.3.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.30.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.31.attn.proj.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.31.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.31.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.31.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.31.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.31.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.31.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.32.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.33.attn.proj.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.33.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.33.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.33.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.33.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.33.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.33.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.34.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.34.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.34.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.34.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.34.mlp.fc3.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.34.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.34.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.35.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.35.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.35.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.35.mlp.fc2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.35.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.35.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.35.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.mlp.fc1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.36.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.36.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.37.attn.proj.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.37.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.37.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.37.mlp.fc2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.37.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.37.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.37.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.mlp.fc2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.38.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.38.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.39.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.39.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.39.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.39.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.39.mlp.fc3.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.39.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.39.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.4.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.40.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.40.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.41.norm2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.5.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.5.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.6.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.attn.qkv.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.8.mlp.fc2.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.8.mlp.fc3.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.mlp.fc3.weight": "model-00001-of-00002.safetensors",
"vision_tower.blocks.9.norm1.weight": "model-00002-of-00002.safetensors",
"vision_tower.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.merger.ln_q.bias": "model-00001-of-00002.safetensors",
"vision_tower.merger.ln_q.weight": "model-00001-of-00002.safetensors",
"vision_tower.merger.mlp.0.bias": "model-00001-of-00002.safetensors",
"vision_tower.merger.mlp.0.weight": "model-00001-of-00002.safetensors",
"vision_tower.merger.mlp.2.bias": "model-00002-of-00002.safetensors",
"vision_tower.merger.mlp.2.weight": "model-00001-of-00002.safetensors",
"vision_tower.patch_embed.patchifier.norm.weight": "model-00001-of-00002.safetensors",
"vision_tower.patch_embed.patchifier.proj.bias": "model-00002-of-00002.safetensors",
"vision_tower.patch_embed.patchifier.proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.post_trunk_norm.weight": "model-00001-of-00002.safetensors"
}
}

131
modeling_dots_ocr.py Normal file
View File

@@ -0,0 +1,131 @@
from typing import List, Optional, Tuple, Union
import torch
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.models.qwen2 import Qwen2ForCausalLM
from .configuration_dots import DotsVisionConfig, DotsOCRConfig
from .modeling_dots_vision import DotsVisionTransformer
# Soft limit on the number of images (rows of grid_thw) per call. It is not
# enforced: DotsOCRForCausalLM.prepare_inputs_embeds only prints a warning
# when the limit is exceeded.
DOTS_VLM_MAX_IMAGES = 200
class DotsOCRForCausalLM(Qwen2ForCausalLM):
    """Qwen2 causal language model with a DotsVisionTransformer image encoder.

    Features produced by ``vision_tower`` are scattered into the token
    embedding sequence at the positions where ``input_ids`` equals
    ``config.image_token_id``; the resulting embeddings are then handed to
    the parent ``Qwen2ForCausalLM.forward``.
    """

    config_class = DotsOCRConfig

    def __init__(self, config: DotsOCRConfig):
        super().__init__(config)

        # The vision config may be a plain dict; promote it to a
        # DotsVisionConfig and store the object back on the model config.
        if isinstance(self.config.vision_config, dict):
            vision_config = DotsVisionConfig(**self.config.vision_config)
            self.config.vision_config = vision_config
        else:
            vision_config = self.config.vision_config

        self.vision_tower = DotsVisionTransformer(vision_config)

    def prepare_inputs_embeds(
        self,
        input_ids: torch.LongTensor,
        pixel_values: Optional[torch.FloatTensor] = None,
        grid_thw: Optional[torch.FloatTensor] = None,
        img_mask: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` and overwrite image-token positions with vision features.

        Args:
            input_ids: Token ids passed to the input embedding layer.
            pixel_values: Input for ``vision_tower``. When ``None`` the plain
                token embeddings are returned.
            grid_thw: Per-image grid description forwarded to ``vision_tower``.
                Required whenever ``pixel_values`` is given.
            img_mask: Boolean mask with the same shape as ``input_ids`` marking
                the positions to be replaced. Required whenever
                ``pixel_values`` is given.

        Returns:
            The embedding tensor with vision features scattered into the
            masked positions.

        Raises:
            AssertionError: If ``img_mask`` or ``grid_thw`` is missing while
                ``pixel_values`` is given, or if the number of vision features
                does not match the number of (possibly truncated) masked
                positions.
        """
        inputs_embeds = self.get_input_embeddings()(input_ids)
        if pixel_values is None:
            return inputs_embeds

        assert img_mask is not None
        assert grid_thw is not None, "grid_thw is required when pixel_values is given"
        if grid_thw.shape[0] > DOTS_VLM_MAX_IMAGES:
            print(
                f"Num image exceeded: {grid_thw.shape[0]} > {DOTS_VLM_MAX_IMAGES}, which may cause FSDP hang"
            )

        vision_embeddings = self.vision_tower(pixel_values, grid_thw)
        num_vision_tokens = vision_embeddings.size(0)

        # Keep the (N, mask.ndim) coordinate matrix as-is. Squeezing it would
        # collapse the single-token case to shape (mask.ndim,), which makes the
        # count wrong and breaks the per-dimension indexing below.
        true_indices = torch.nonzero(img_mask)
        num_mask_tokens = true_indices.size(0)

        if num_mask_tokens > num_vision_tokens:
            print(
                f"img_mask sum > VE and will be truncated, mask.sum()={num_mask_tokens} {vision_embeddings.size(0)=}"
            )
            # Keep only the first `num_vision_tokens` masked positions.
            kept_indices = true_indices[:num_vision_tokens]
            new_img_mask = torch.zeros_like(img_mask, device=img_mask.device)
            new_img_mask[tuple(kept_indices.unbind(dim=1))] = True
        else:
            new_img_mask = img_mask

        assert (
            vision_embeddings.size(0) == new_img_mask.sum()
        ), f"{vision_embeddings.size(0)=}, {new_img_mask.sum()=}"

        inputs_embeds = inputs_embeds.masked_scatter(
            new_img_mask.to(inputs_embeds.device).unsqueeze(-1).expand_as(inputs_embeds),
            vision_embeddings.to(inputs_embeds.device).type(inputs_embeds.dtype),
        )
        return inputs_embeds

    def forward(
        self,
        input_ids: torch.LongTensor,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        logits_to_keep: int = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model on text tokens merged with image features.

        When ``inputs_embeds`` is not supplied it is built by
        ``prepare_inputs_embeds`` using the positions where ``input_ids``
        equals ``config.image_token_id``. The parent forward is always called
        with ``inputs_embeds`` only; ``input_ids`` is not forwarded.

        ``return_dict`` is accepted for signature compatibility but is not
        forwarded to the parent forward.

        Returns:
            Whatever ``Qwen2ForCausalLM.forward`` returns.

        Raises:
            AssertionError: If ``input_ids`` is empty along its first dimension.
        """
        assert len(input_ids) >= 1, f"empty input_ids {input_ids.shape=} will cause gradnorm nan"

        if inputs_embeds is None:
            img_mask = input_ids == self.config.image_token_id
            inputs_embeds = self.prepare_inputs_embeds(input_ids, pixel_values, image_grid_thw, img_mask)

        outputs = super().forward(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            labels=labels,
            use_cache=use_cache if use_cache is not None else self.config.use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            # return_dict=return_dict,
            logits_to_keep=logits_to_keep,
            **loss_kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        num_logits_to_keep=None,
        **kwargs,
    ):
        """Build per-step model inputs, attaching ``pixel_values`` on the first step only.

        Everything except ``pixel_values`` is delegated to the parent
        implementation. ``pixel_values`` is added to the returned dict only
        when the step starts at cache position 0.
        """
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            num_logits_to_keep=num_logits_to_keep,
            **kwargs,
        )

        if cache_position is not None:
            is_first_step = cache_position[0] == 0
        else:
            # cache_position defaults to None and cannot be indexed.
            # NOTE(review): treating "no cache yet" as the first step is an
            # assumption about callers that omit cache_position - confirm.
            is_first_step = past_key_values is None

        if is_first_step:
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

404
modeling_dots_vision.py Normal file
View File

@@ -0,0 +1,404 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from flash_attn import flash_attn_varlen_func
from torch.nn import LayerNorm
from transformers.modeling_utils import PreTrainedModel
from .configuration_dots import DotsVisionConfig
def rotate_half(x):
    """Split the last dimension in two and return ``cat(-second_part, first_part)``.

    The split point is ``x.shape[-1] // 2``, so for an odd-sized last
    dimension the second part is one element longer than the first.
    """
    split_at = x.shape[-1] // 2
    leading, trailing = x[..., :split_at], x[..., split_at:]
    return torch.cat((-trailing, leading), dim=-1)
def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
    """Apply a rotary position embedding to ``tensor`` using the angles in ``freqs``.

    The arithmetic is done in float32 and the result is cast back to the
    dtype ``tensor`` had on entry.

    Args:
        tensor: Values to rotate.
        freqs: Rotation angles. Their cosine and sine get a new axis at
            position 1, are tiled twice along the last axis, and get a new
            leading axis before being broadcast against ``tensor``.
            (assumes freqs is 2-D, one row per position - TODO confirm)

    Returns:
        ``tensor * cos + rotate_half(tensor) * sin`` in the original dtype.
    """
    input_dtype = tensor.dtype
    values = tensor.float()

    def _broadcastable(table: torch.Tensor) -> torch.Tensor:
        # Add a singleton axis at dim 1, duplicate the last axis, add a leading axis.
        return table.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()

    cos_table = _broadcastable(freqs.cos())
    sin_table = _broadcastable(freqs.sin())

    rotated = (values * cos_table) + (rotate_half(values) * sin_table)
    return rotated.to(input_dtype)
class VisionRotaryEmbedding(nn.Module):
    """Generates rotary angle tables as the outer product of positions and inverse frequencies."""

    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        exponents = torch.arange(0, dim, 2, dtype=torch.float) / dim
        # persistent=False keeps the buffer out of the state dict.
        self.register_buffer("inv_freq", 1.0 / (theta ** exponents), persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        """Return a ``(seqlen, len(inv_freq))`` table of ``position * inv_freq`` values."""
        positions = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        return torch.outer(positions, self.inv_freq)
class PatchMerger(nn.Module):
    """Merges groups of ``spatial_merge_size ** 2`` patch features and projects them to ``dim``.

    The input is optionally normalised per patch (over ``context_dim``), then
    reshaped so that each row holds ``context_dim * spatial_merge_size ** 2``
    values, and finally passed through a Linear -> GELU -> Linear MLP.

    Args:
        dim: Output feature size of the MLP.
        context_dim: Feature size of each incoming patch.
        spatial_merge_size: Side length of the merge window; the merged row
            width is ``context_dim * spatial_merge_size ** 2``.
        pre_norm: ``"layernorm"`` or ``"rmsnorm"`` selects the normalisation
            applied before merging. Any other value disables normalisation.
        init_merger_std: When given, both MLP weights are drawn from a normal
            distribution with this standard deviation and both biases are
            zeroed.
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
        pre_norm="layernorm",
        init_merger_std=None,
    ) -> None:
        super().__init__()
        self.hidden_size = context_dim * (spatial_merge_size ** 2)
        self.pre_norm = pre_norm

        if self.pre_norm == "layernorm":
            self.ln_q = LayerNorm(context_dim, eps=1e-6)
        elif self.pre_norm == "rmsnorm":
            self.ln_q = RMSNorm(context_dim, eps=1e-6)
        else:
            print("no norm in patch merger")
            # Always define ln_q so forward() can test it directly. Testing the
            # truthiness of `pre_norm` instead would send any unrecognised
            # non-empty string to a layer that was never created.
            self.ln_q = None

        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Linear(self.hidden_size, dim),
        )

        if init_merger_std is not None:
            nn.init.normal_(self.mlp[0].weight, mean=0.0, std=init_merger_std)
            nn.init.zeros_(self.mlp[0].bias)
            nn.init.normal_(self.mlp[2].weight, mean=0.0, std=init_merger_std)
            nn.init.zeros_(self.mlp[2].bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise (if configured), regroup rows to ``hidden_size`` width, and apply the MLP."""
        if self.ln_q is not None:
            x = self.ln_q(x)
        return self.mlp(x.view(-1, self.hidden_size))
class VisionAttention(nn.Module):
    """Multi-head self-attention computed with explicit matmul/softmax.

    Attention is restricted to the segments delimited by ``cu_seqlens``: a
    position may only attend to positions inside its own
    ``[cu_seqlens[i-1], cu_seqlens[i])`` range.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        """Attend within each segment and return the projected output, one row per input row."""
        seq_length = hidden_states.shape[0]

        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)  # each: (seq_length, num_heads, -1)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Additive mask: most-negative value everywhere, zero on the
        # block-diagonal squares that correspond to each segment.
        attention_mask = torch.full(
            [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
        )
        for seg_start, seg_end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            attention_mask[..., seg_start:seg_end, seg_start:seg_end] = 0

        # Move heads to the leading axis: (num_heads, seq_length, -1).
        q, k, v = (t.transpose(0, 1) for t in (q, k, v))

        scores = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
        scores = scores + attention_mask
        # Softmax is evaluated in float32 and cast back to the query dtype.
        probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(q.dtype)

        context = torch.matmul(probs, v).transpose(0, 1).reshape(seq_length, -1)
        return self.proj(context)
class VisionFlashAttention2(nn.Module):
    """Multi-head self-attention that delegates to ``flash_attn_varlen_func``.

    Segment boundaries are passed straight through as ``cu_seqlens``; whether
    attention is causal is taken from ``config.is_causal``.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)
        self.config = config
        self.is_causal = config.is_causal

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        """Attend within each segment and return the projected output, one row per input row."""
        seq_length = hidden_states.shape[0]

        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)  # 'shd'
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # Longest segment, as a Python number.
        segment_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = segment_lengths.max().item()

        attn_output = flash_attn_varlen_func(
            q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen, causal=self.is_causal
        )
        return self.proj(attn_output.reshape(seq_length, -1))
class VisionSdpaAttention(nn.Module):
    """Multi-head self-attention computed with ``F.scaled_dot_product_attention``.

    A boolean block-diagonal mask built from ``cu_seqlens`` limits attention
    to positions inside the same segment.
    """

    def __init__(self, config, dim: int, num_heads: int = 16, bias=True) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=bias)
        self.proj = nn.Linear(dim, dim, bias=bias)
        self.config = config

    def forward(
        self,
        hidden_states: torch.Tensor,
        cu_seqlens: torch.Tensor,
        rotary_pos_emb: torch.Tensor = None,
    ) -> torch.Tensor:
        """Attend within each segment and return the projected output, one row per input row."""
        seq_length = hidden_states.shape[0]

        fused = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1)
        q, k, v = fused.permute(1, 0, 2, 3).unbind(0)  # each: (seq_length, num_heads, -1)
        q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
        k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)

        # True marks pairs that may attend to each other: only the
        # block-diagonal squares that correspond to each segment.
        attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
        for seg_start, seg_end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
            attention_mask[..., seg_start:seg_end, seg_start:seg_end] = True

        # Move heads to the leading axis: (num_heads, seq_length, -1).
        q, k, v = (t.transpose(0, 1) for t in (q, k, v))

        attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
        attn_output = attn_output.transpose(0, 1).reshape(seq_length, -1)
        return self.proj(attn_output)
# Maps an attention-implementation name to the attention module class that
# DotsVisionBlock instantiates for it.
DOTS_VISION_ATTENTION_CLASSES = {
    "eager": VisionAttention,
    "flash_attention_2": VisionFlashAttention2,
    "sdpa": VisionSdpaAttention,
}
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned per-feature scale."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        """Divide ``x`` by the root of its mean square (plus ``eps``) along the last axis."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Normalise in float32, cast back to the input's type, then scale.
        normalised = self._norm(x.float()).type_as(x)
        return normalised * self.weight

    def extra_repr(self) -> str:
        return f"{tuple(self.weight.shape)}, eps={self.eps}"
class DotsSwiGLUFFN(nn.Module):
    """Gated feed-forward layer: ``fc2(silu(fc1(x)) * fc3(x))``.

    Layer sizes come from ``config.embed_dim`` (input/output width) and
    ``config.intermediate_size`` (inner width); ``config.use_bias`` toggles
    the bias on all three linear layers.
    """

    def __init__(self, config):
        super().__init__()
        inner_dim = config.intermediate_size
        model_dim = config.embed_dim
        with_bias = config.use_bias

        self.fc1 = nn.Linear(model_dim, inner_dim, bias=with_bias)
        self.fc2 = nn.Linear(inner_dim, model_dim, bias=with_bias)
        self.fc3 = nn.Linear(model_dim, inner_dim, bias=with_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = F.silu(self.fc1(x))
        value = self.fc3(x)
        return self.fc2(gate * value)
class DotsPatchEmbed(nn.Module):
    """Turns flattened pixel patches into normalised embedding vectors.

    Each patch is projected by a Conv2d whose kernel and stride both equal
    the patch size, then passed through RMSNorm.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.num_channels = config.num_channels
        self.patch_size = config.patch_size
        self.temporal_patch_size = config.temporal_patch_size
        self.embed_dim = config.embed_dim

        patch_hw = (config.patch_size, config.patch_size)
        self.proj = nn.Conv2d(
            config.num_channels,
            config.embed_dim,
            kernel_size=patch_hw,
            stride=patch_hw,
        )
        self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
        """Embed patches; ``grid_thw`` is accepted but not used here."""
        patches = x.view(
            -1, self.num_channels, self.temporal_patch_size, self.patch_size, self.patch_size
        )
        # Only index 0 along the temporal axis is fed to the 2-D projection.
        first_frame = patches[:, :, 0]
        embedded = self.proj(first_frame).view(-1, self.embed_dim)
        return self.norm(embedded)
class DotsViTPreprocessor(nn.Module):
    """Thin wrapper that owns a ``DotsPatchEmbed`` under the attribute name ``patchifier``."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.embed_dim
        # Both patch dimensions come from the same config value.
        self.patch_h = self.patch_w = config.patch_size
        self.patchifier = DotsPatchEmbed(config)

    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
        """Delegate directly to the patchifier and return its tokens."""
        return self.patchifier(x, grid_thw)
class DotsVisionBlock(nn.Module):
    """Pre-norm transformer block: attention and gated MLP, each with a residual connection.

    The attention class is looked up in ``DOTS_VISION_ATTENTION_CLASSES`` by
    ``attn_implementation``.
    """

    def __init__(self, config, attn_implementation: str = "flash_attention_2"):
        super().__init__()
        attention_cls = DOTS_VISION_ATTENTION_CLASSES[attn_implementation]
        self.attn = attention_cls(
            config, config.embed_dim, num_heads=config.num_attention_heads, bias=config.use_bias
        )
        self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
        self.mlp = DotsSwiGLUFFN(config)
        self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
        attn_out = self.attn(
            self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
        )
        hidden_states = hidden_states + attn_out

        mlp_out = self.mlp(self.norm2(hidden_states))
        return hidden_states + mlp_out
class DotsVisionTransformer(PreTrainedModel):
    """Vision encoder: patch embedding, a stack of ``DotsVisionBlock``s, optional final norm, patch merger.

    ``forward`` takes flattened pixel patches plus a ``grid_thw`` tensor with
    one ``(t, h, w)`` row per image and returns the merged patch features.
    """

    def __init__(self, config: DotsVisionConfig) -> None:
        super().__init__(config)
        self.config = config
        self.spatial_merge_size = config.spatial_merge_size

        self.patch_embed = DotsViTPreprocessor(config)
        self._init_weights(self.patch_embed.patchifier.proj)

        head_dim = config.embed_dim // config.num_attention_heads
        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)

        _num_hidden_layers = config.num_hidden_layers
        self.blocks = nn.ModuleList(
            [DotsVisionBlock(config, config.attn_implementation) for _ in range(_num_hidden_layers)]
        )

        if self.config.post_norm:
            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)

        self.merger = PatchMerger(
            dim=config.hidden_size,
            context_dim=config.embed_dim,
            spatial_merge_size=config.spatial_merge_size,
            init_merger_std=self.config.init_merger_std,
        )

        self.gradient_checkpointing = False
        self._gradient_checkpointing_func = torch.utils.checkpoint.checkpoint

    def _init_weights(self, module):
        """Initialise ``module`` with ``N(0, initializer_range)`` weights and zero bias/padding row.

        Handles linear and convolution layers, and embeddings. ``nn.Conv2d``
        is included because the patch projection built by ``DotsPatchEmbed``
        is a Conv2d; matching only Conv3d made the explicit call in
        ``__init__`` a no-op.
        """
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.Conv3d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dtype(self) -> torch.dtype:
        # Reported from one representative parameter of the first block.
        return self.blocks[0].mlp.fc2.weight.dtype

    @property
    def device(self) -> torch.device:
        # Reported from one representative parameter of the first block.
        return self.blocks[0].mlp.fc2.weight.device

    def get_pos_ids_by_grid(self, grid_thw):
        """Return, per image, a ``(t*h*w, 2)`` tensor of (row, column) patch coordinates.

        Coordinates are emitted in merge-window order: the ``h x w`` grid is
        cut into ``spatial_merge_size x spatial_merge_size`` windows and the
        patches of one window are listed contiguously. The per-frame list is
        repeated ``t`` times.
        """
        merge = self.spatial_merge_size
        pos_ids = []
        for t, h, w in grid_thw:
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(h // merge, merge, w // merge, merge)
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(h // merge, merge, w // merge, merge)
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()

            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))

        return pos_ids

    def rot_pos_emb(self, grid_thw):
        """Build the rotary angle table for every patch described by ``grid_thw``.

        The angle table is generated once for the largest height/width in the
        batch and then gathered by each patch's (row, column) pair; the row
        and column angles are flattened into a single trailing axis.
        """
        pos_ids = self.get_pos_ids_by_grid(grid_thw)
        pos_ids = torch.cat(pos_ids, dim=0)
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True) -> torch.Tensor:
        """Encode pixel patches into merged vision features.

        Args:
            hidden_states: Flattened pixel patches consumed by ``patch_embed``.
            grid_thw: One ``(t, h, w)`` row per image.
            bf16: When true, the input is cast to bfloat16 before embedding.

        Returns:
            The output of the patch merger.
        """
        if bf16:
            hidden_states = hidden_states.bfloat16()
        hidden_states = self.patch_embed(hidden_states, grid_thw)

        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        # Cumulative patch counts with a leading zero: each frame contributes
        # one segment of h*w patches, repeated t times per image.
        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        for blk in self.blocks:
            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__,
                    hidden_states,
                    cu_seqlens,
                    rotary_pos_emb,
                )
            else:
                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)

        if self.config.post_norm:
            hidden_states = self.post_trunk_norm(hidden_states)

        hidden_states = self.merger(hidden_states)
        return hidden_states

22
preprocessor_config.json Normal file
View File

@@ -0,0 +1,22 @@
{
"auto_map": {
"AutoProcessor": "configuration_dots.DotsVLProcessor"
},
"min_pixels": 3136,
"max_pixels": 11289600,
"patch_size": 14,
"temporal_patch_size": 1,
"merge_size": 2,
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"image_processor_type": "Qwen2VLImageProcessor",
"processor_class": "DotsVLProcessor"
}

25
special_tokens_map.json Normal file
View File

@@ -0,0 +1,25 @@
{
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"eos_token": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": "[PAD]"
}

303490
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

391
tokenizer_config.json Normal file
View File

@@ -0,0 +1,391 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151665": {
"content": "<|imgpad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151666": {
"content": "<|img|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151667": {
"content": "<|endofimg|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151668": {
"content": "<|systemprompt|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151669": {
"content": "<|endofsystemprompt|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151670": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151671": {
"content": "<|endofuser|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151672": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151673": {
"content": "<|endofassistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151674": {
"content": "<|ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151675": {
"content": "<|ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151676": {
"content": "[SEP]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151677": {
"content": "<|pic|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151678": {
"content": "<|text|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151679": {
"content": "<|pictotext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151680": {
"content": "[PAD]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151681": {
"content": "<|slice|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151682": {
"content": "<|endofslice|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151683": {
"content": "<|imgrowend|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151684": {
"content": "<|polygon_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151685": {
"content": "<|polygon_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151686": {
"content": "<|image_gen_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151687": {
"content": "<|image_gen_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"chat_template": "{%- for m in messages %}\n {%- if m.role == 'system' %}\n {{- '<|system|>' + m.content + '<|endofsystem|>\\n' }}\n {%- elif m.role == 'user' %}\n {{- '<|user|>' + m.content + '<|endofuser|>' }}\n {%- elif m.role == 'assistant' %}\n {{- '<|assistant|>' + m.content }}\n {%- if not loop.last %}\n {{- '<|endofassistant|>' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if messages[-1].role != 'assistant' %}\n {{- '<|assistant|>' }}\n{%- endif %}",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 131072,
"pad_token": "[PAD]",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}

1
vocab.json Normal file

File diff suppressed because one or more lines are too long