初始化项目，由ModelHub XC社区提供模型

Model: apple/SimpleSD-4B-thinking Source: Original Platform
2026-05-09 14:17:30 +08:00
commit 88bfdba243
16 changed files with 152465 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,36 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/LICENSE
+++ b/LICENSE
@@ -0,0 +1,88 @@
 Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is
 specifically developed and released by Apple Inc. ("Apple") for the sole purpose
 of scientific research of artificial intelligence and machine-learning
 technology. “Apple Machine Learning Research Model” means the model, including
 but not limited to algorithms, formulas, trained model weights, parameters,
 configurations, checkpoints, and any related materials (including
 documentation).
 This Apple Machine Learning Research Model is provided to You by
 Apple in consideration of your agreement to the following terms, and your use,
 modification, creation of Model Derivatives, and or redistribution of the Apple
 Machine Learning Research Model constitutes acceptance of this Agreement. If You
 do not agree with these terms, please do not use, modify, create Model
 Derivatives of, or distribute this Apple Machine Learning Research Model or
 Model Derivatives.
 * License Scope: In consideration of your agreement to abide by the following
  terms, and subject to these terms, Apple hereby grants you a personal,
  non-exclusive, worldwide, non-transferable, royalty-free, revocable, and
  limited license, to use, copy, modify, distribute, and create Model
  Derivatives (defined below) of the Apple Machine Learning Research Model
  exclusively for Research Purposes. You agree that any Model Derivatives You
  may create or that may be created for You will be limited to Research Purposes
  as well. “Research Purposes” means non-commercial scientific research and
  academic development activities, such as experimentation, analysis, testing
  conducted by You with the sole intent to advance scientific knowledge and
  research. “Research Purposes” does not include any commercial exploitation,
  product development or use in any commercial product or service.
 * Distribution of Apple Machine Learning Research Model and Model Derivatives:
  If you choose to redistribute Apple Machine Learning Research Model or its
  Model Derivatives, you must provide a copy of this Agreement to such third
  party, and ensure that the following attribution notice be provided: “Apple
  Machine Learning Research Model is licensed under the Apple Machine Learning
  Research Model License Agreement.” Additionally, all Model Derivatives must
  clearly be identified as such, including disclosure of modifications and
  changes made to the Apple Machine Learning Research Model. The name,
  trademarks, service marks or logos of Apple may not be used to endorse or
  promote Model Derivatives or the relationship between You and Apple. “Model
  Derivatives” means any models or any other artifacts created by modifications,
  improvements, adaptations, alterations to the architecture, algorithm or
  training processes of the Apple Machine Learning Research Model, or by any
  retraining, fine-tuning of the Apple Machine Learning Research Model.
 * No Other License: Except as expressly stated in this notice, no other rights
  or licenses, express or implied, are granted by Apple herein, including but
  not limited to any patent, trademark, and similar intellectual property rights
  worldwide that may be infringed by the Apple Machine Learning Research Model,
  the Model Derivatives or by other works in which the Apple Machine Learning
  Research Model may be incorporated.
 * Compliance with Laws: Your use of Apple Machine Learning Research Model must
  be in compliance with all applicable laws and regulations.
 * Term and Termination: The term of this Agreement will begin upon your
  acceptance of this Agreement or use of the Apple Machine Learning Research
  Model and will continue until terminated in accordance with the following
  terms. Apple may terminate this Agreement at any time if You are in breach of
  any term or condition of this Agreement. Upon termination of this Agreement,
  You must cease to use all Apple Machine Learning Research Models and Model
  Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will
  survive termination.
 * Disclaimer and Limitation of Liability: This Apple Machine Learning Research
  Model and any outputs generated by the Apple Machine Learning Research Model
  are provided on an “AS IS” basis. APPLE MAKES NO WARRANTIES, EXPRESS OR
  IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
  NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE,
  REGARDING THE APPLE MACHINE LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY
  THE APPLE MACHINE LEARNING RESEARCH MODEL. You are solely responsible for
  determining the appropriateness of using or redistributing the Apple Machine
  Learning Research Model and any outputs of the Apple Machine Learning Research
  Model and assume any risks associated with Your use of the Apple Machine
  Learning Research Model and any output and results. IN NO EVENT SHALL APPLE BE
  LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
  IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF
  THE APPLE MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE MACHINE
  LEARNING RESEARCH MODEL, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT,
  TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS
  BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Governing Law: This Agreement will be governed by and construed under the laws
  of the State of California without regard to its choice of law principles. The
  Convention on Contracts for the International Sale of Goods shall not apply to
  the Agreement except that the arbitration clause and any arbitration hereunder
  shall be governed by the Federal Arbitration Act, Chapters 1 and 2. 
 Copyright (C) 2026 Apple Inc. All Rights Reserved.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,69 @@
 ---
 license: apple-amlr
 base_model:
 - Qwen/Qwen3-4B-Thinking-2507
 tags:
 - self-distillation
 - code-generation
 library_name: transformers
 ---
 # SimpleSD-4B-thinking
 This model is an example of the **Simple Self-Distillation (SimpleSD)** method, which improves code generation by fine-tuning a language model on its own sampled outputs—without rewards, verifiers, teacher models, or reinforcement learning. Please see the paper below for more information. The model is initialized from Qwen3-4B-Thinking-2507.
 - **Self-distillation sampling:** temperature=1.1, top_p=0.95, top_k=20
 - **Evaluation sampling:** temperature=0.7, top_p=0.95, top_k=20
 - **Paper:** https://arxiv.org/abs/2604.01193
 - **Code:** https://github.com/apple/ml-ssd
 ## Notes
 - These are research checkpoints for reproducibility.
 - They are not optimized Qwen releases.
 - They don't represent a broader open-source model strategy.
 ## Usage
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 model = AutoModelForCausalLM.from_pretrained("apple/SimpleSD-4B-thinking")
 tokenizer = AutoTokenizer.from_pretrained("apple/SimpleSD-4B-thinking")
 ```
 ## Method
 SimpleSD samples solutions from the base model using non-unit temperature and top-k/top-p truncation, then fine-tunes on those samples via standard supervised learning. Despite its simplicity, SimpleSD yields large gains on competitive programming benchmarks, with improvements concentrating on harder problems. The mechanism traces to resolving a *precision–exploration conflict*: SimpleSD reshapes token distributions in a context-dependent way so that a single global decoding configuration becomes far more effective at evaluation time.
 ## Results
 LiveCodeBench results (pass@k, %; differences from the base model in parentheses)
 | Model | LCBv6 pass@1 | LCBv6 pass@5 | LCBv5 pass@1 | LCBv5 pass@5 |
 |---|---|---|---|---|
 | Qwen3-4B-Thinking-2507 (base) | 54.5 | 67.5 | 59.6 | 70.3 |
 | **+ SimpleSD (this model)** | **57.8** (+3.3) | **71.4** (+3.9) | **63.1** (+3.5) | **74.7** (+4.4) |
 ## Paper
 [**Embarrassingly Simple Self-Distillation Improves Code Generation**](https://arxiv.org/abs/2604.01193)
 ```bibtex
@misc{zhang2026embarrassinglysimpleselfdistillationimproves,
      title={Embarrassingly Simple Self-Distillation Improves Code Generation},
      author={Ruixiang Zhang and Richard He Bai and Huangjie Zheng and Navdeep Jaitly and Ronan Collobert and Yizhe Zhang},
      year={2026},
      eprint={2604.01193},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2604.01193},
 }
 ```
 ## License
 This model is released under the [Apple Machine Learning Research Model License Agreement](https://huggingface.co/apple/SimpleSD-4B-thinking/blob/main/LICENSE).
--- a/added_tokens.json
+++ b/added_tokens.json
@@ -0,0 +1,28 @@
 {
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
 }
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,86 @@
 {#- Chat template: renders `messages` (and optional `tools`) as a sequence of <|im_start|>role ... <|im_end|> turns. #}
 {%- if tools %}
    {#- With tools: open a system turn, include the first message's content when it is a system message, list each tool as JSON inside <tools> tags, then describe the <tool_call> reply format. #}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
 {%- else %}
    {#- Without tools: a leading system message, if present, is emitted as its own turn. #}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
 {%- endif %}
 {#- Walk the messages from last to first to find the index of the last user message whose content is a string and is not wrapped in <tool_response>...</tool_response>. If none is found, last_query_index stays at the index of the final message. #}
 {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
 {%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
 {%- endfor %}
 {#- Render every message in order. #}
 {%- for message in messages %}
    {#- Content that is not a string is rendered as an empty string. #}
    {%- if message.content is string %}
        {%- set content = message.content %}
    {%- else %}
        {%- set content = '' %}
    {%- endif %}
    {#- User messages, and system messages other than the first message, are wrapped in <|im_start|>role ... <|im_end|>. #}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {#- Reasoning comes from message.reasoning_content when that is a string. Otherwise, if content contains </think>, reasoning is the text before the first </think> (after the last <think> in that part) and content becomes the text after the last </think>. #}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is string %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {#- A <think> block is written only for assistant messages after the last user query, and there only for the final message or when reasoning is non-empty. All other assistant turns are rendered without reasoning. #}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {#- Append each tool call as a JSON object inside <tool_call> tags. A `function` wrapper is unwrapped; string arguments are emitted as-is and any other value is passed through tojson. #}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages share a single user turn: the turn is opened before the first one and closed after the last one, with each message wrapped in <tool_response> tags. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
 {%- endfor %}
 {#- The generation prompt opens an assistant turn with the <think> block already started. #}
 {%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n<think>\n' }}
 {%- endif %}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,68 @@
 {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 262144,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 5000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
 }
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,14 @@
 {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "pad_token_id": 151643,
  "temperature": 0.6,
  "top_k": 20,
  "top_p": 0.95,
  "transformers_version": "4.57.3",
  "trust_remote_code": true
 }
--- a/merges.txt
+++ b/merges.txt
--- a/model-00001-of-00003.safetensors
+++ b/model-00001-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:cac93bed40ddc915fdf13e17a500f983d3e5fd1c45961a31b03709a764db03bb
 size 3957900808
--- a/model-00002-of-00003.safetensors
+++ b/model-00002-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:1179ce4abee5d938fbca4a6e67c542b342bab22f59456fdc042123f74da0830c
 size 3987450496
--- a/model-00003-of-00003.safetensors
+++ b/model-00003-of-00003.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:e88ee7e6188fe7bc9ab5b88aecd946bb03033c4861d2471d92bff8994036fce9
 size 99630608
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
@@ -0,0 +1,405 @@
 {
    "metadata": {
        "total_size": 8045591552
    },
    "weight_map": {
        "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.31.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
        "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.34.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.20.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.norm.weight": "model-00003-of-00003.safetensors",
        "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.19.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
        "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.29.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.30.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
        "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
        "model.layers.34.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.16.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.33.input_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
        "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
        "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.32.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.20.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
        "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00003.safetensors",
        "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00003.safetensors",
        "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
        "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
        "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors"
    }
 }
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,31 @@
 {
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,239 @@
 {
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151665": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151666": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151667": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151668": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 262144,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
 }
--- a/vocab.json
+++ b/vocab.json