初始化项目，由ModelHub XC社区提供模型

Model: Godcat252/Besttop977 Source: Original Platform
2026-05-23 21:14:22 +08:00
commit d36868b650
20 changed files with 8552 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,37 @@
 # Git LFS routing rules. Every pattern below sets filter/diff/merge to the
 # `lfs` driver and unsets `text`, so matching files are stored through Git LFS
 # and are not subject to text (line-ending) conversion.
 # The list covers archive formats, serialized model/tensor formats and
 # TensorBoard event files, followed by two repository-specific files
 # (preview-banner.png and tokenizer.json).
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.npy filter=lfs diff=lfs merge=lfs -text
 *.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 preview-banner.png filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
--- a/190
+++ b/190
@@ -0,0 +1,190 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding any notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   Copyright 2026 Trillion Labs
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,211 @@
 ---
 license: apache-2.0
 language:
 - en
 - ko
 library_name: transformers
 tags:
 - moe
 - mixture-of-experts
 - gravity
 - trillion-labs
 - chat
 - post-trained
 - preview
 pipeline_tag: text-generation
 base_model:
 - trillionlabs/Gravity-16B-A3B-Base
 ---
 <p align="center">
  <img src="preview-banner.png" alt="Gravity-16B-A3B-Preview" width="100%">
 </p>
 # Gravity-16B-A3B-Preview
 **Gravity-16B-A3B-Preview** is a post-trained language model built on [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) by [Trillion Labs](https://trillionlabs.co). Starting from the base model, it underwent context length extension (32K → 128K), supervised fine-tuning (SFT), and reinforcement learning (GRPO) focused on science and code.
 This is a preview release offering a strong balance of capability, efficiency, and long-context support for its size. We are actively working on agentic capabilities for the full release.
 ## Model Summary
 | Property | Value |
 |---|---|
 | **Base Model** | [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) |
 | **Total Parameters** | 16.24B |
 | **Active Parameters** | 3.16B |
 | **Architecture** | GravityMoE |
 | **Context Length** | 131,072 tokens (128K) |
 | **Precision** | bf16 |
 | **License** | Apache 2.0 |
 For full architectural details (MLA, MoE routing, tokenizer, etc.), see the [base model card](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base).
 ## Post-Training Pipeline
 Starting from [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) (pretrained on ~5.5T tokens):
 1. **Context Length Extension** — Extended from 32K to 128K tokens.
 2. **Supervised Fine-Tuning (SFT)** — Instruction tuning for general chat and task-following capabilities.
 3. **Reinforcement Learning (GRPO)** — Single-step Group Relative Policy Optimization focused on science and code domains.
 Agentic RL and multi-turn RL stages are in progress and will be included in future releases.
 ## Evaluation Results
 | Category | Benchmark | Metric | Score |
 |---|---|---|---|
 | **Math** | AIME 2024 | acc | 43.3 |
 | | GSM8K | acc | 91.8 |
 | | MATH500 | acc | 88.6 |
 | **Code** | HumanEval | pass@1 | 89.0 |
 | | MBPP | pass@1 | 96.0 |
 | | LiveCodeBench V6 | pass@1 | 41.0 |
 | **Knowledge** | MMLU | acc | 80.1 |
 | | MMLU-Pro | acc | 71.5 |
 | | BBH | acc | 79.24 |
 | **Science** | GPQA Diamond | acc | 55.1 |
 | | ARC Challenge | acc | 92.32 |
 | | ChemBench | acc | 68.6 |
 | | Molang Bench (Editing) | SMILES validity / Tanimoto similarity / Accuracy | 70.83 / 86.43 / 43.23 |
 | | Molang Bench (Generation) | SMILES validity / Tanimoto similarity / Accuracy | 35.96 / 43.24 / 1.69 |
 | **Instruction Following** | IFEval | instruct level loose | 84.53 |
 | | IFBench | instruct level loose | 46.51 |
 | **Agentic** | Tau^2 (Telecom) | pass@1 | 71.93 |
 | | Scicode | sub problem level | 18.8 |
 | | Terminal Bench | pass@1 | 21.25 |
 | **Long Context** | AA-LCR | pass@1 | 21.0 |
 ### Comparison with Moonlight-16B-A3B-Instruct
 | Category | Benchmark | Metric | Gravity-16B-A3B-Preview | Moonlight-16B-A3B-Instruct |
 |---|---|---|---|---|
 | **Math** | GSM8K | acc | 91.8 | 77.4 |
 | **Code** | HumanEval | pass@1 | 89.0 | 48.1 |
 | | MBPP | pass@1 | 96.0 | 63.8 |
 | **Knowledge** | MMLU | acc | 80.1 | 70.0 |
 | | MMLU-Pro | acc | 71.5 | 42.4 |
 | | BBH | acc | 79.24 | 65.2 |
 > Note: We include Moonlight-16B-A3B-Instruct for comparison since it is similar in size to our model. Moonlight-16B-A3B-Instruct scores are taken from the numbers reported in their own technical report.
 With 3.16B active parameters, 128K context, and broad coverage across math, code, and knowledge benchmarks, the model offers a strong balance of capability and efficiency for its size.
 Agentic benchmarks (multi-step tool use, code execution) are not yet a focus of this release. We are actively training on agentic tasks and will include those results in the next release.
 ## Quickstart
 ### Installation
 ```bash
 pip install "transformers>=5.0" torch
 ```
 ### Using Transformers
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 model_name = "trillionlabs/Gravity-16B-A3B-Preview"
 # trust_remote_code=True is needed because the repository ships its own
 # configuration/modeling code (referenced through `auto_map` in config.json).
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
 )
 messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Solve the equation: x^3 - 6x^2 + 11x - 6 = 0"},
 ]
 # Request the dict form explicitly: the return type of apply_chat_template
 # differs between transformers 4.x (tensor of ids) and 5.x (BatchEncoding), and
 # return_dict=True yields input_ids plus attention_mask on both.
 inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
 ).to(model.device)
 output = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=0.7)
 # Drop the prompt tokens so only the newly generated continuation is printed.
 prompt_len = inputs["input_ids"].shape[-1]
 print(tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True))
 ```
 ## Deployment
 > **Note:** We are working on upstreaming native GravityMoE support to [SGLang](https://github.com/sgl-project/sglang). Until the PR is merged, please use the installation steps below.
 ### SGLang
 Install SGLang from the [sglang-gravity](https://github.com/trillion-labs/sglang-gravity) fork:
 ```bash
 pip install "sglang[all] @ git+https://github.com/trillion-labs/sglang-gravity.git#subdirectory=python"
 ```
 Launch the server:
 ```bash
 python3 -m sglang.launch_server \
    --model-path trillionlabs/Gravity-16B-A3B-Preview \
    --host 0.0.0.0 \
    --port 30000 \
    --tp 8 \
    --trust-remote-code \
    --moe-runner-backend triton \
    --tool-call-parser glm45 \
    --reasoning-parser glm45 \
    --dtype bfloat16
 ```
 Send a request:
 ```bash
 curl http://localhost:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "trillionlabs/Gravity-16B-A3B-Preview",
    "messages": [{"role": "user", "content": "What is the capital of South Korea?"}],
    "max_tokens": 128,
    "temperature": 0.7
  }'
 ```
 ## Limitations
 - This is a preview release. Agentic and multi-turn capabilities are under active development.
 - The model may generate factually incorrect, biased, or harmful content.
 - Performance may degrade on languages not well-represented in the training data.
 ## Acknowledgements
 This model was developed as part of a collaborative research initiative led by **Lunit** and **Trillion Labs**, with a focus on advancing foundation models for science and healthcare.
 - **Lunit** — Project lead and medical AI research
 - **Trillion Labs** — Model architecture, pretraining, and infrastructure
 - **Aigen Science** — Biomedical AI and drug discovery research
 - **SK Biopharmaceuticals** — AI-driven drug development and digital healthcare advisory
 - **Kakao Healthcare** — Medical data standardization and platform support
 We also thank the following participating institutions for their contributions: KAIST (Yoonjae Choi, Taekyun Kim, Jong Chul Ye, Hyunwoo Kim, Seunghoon Hong), Seoul National University (Yousung Jung), Rebellions, Standigm, NHIS Ilsan Hospital, Yongin Severance Hospital, Gangdong Kyung Hee University Hospital, Kyung Hee University Medical Center, Korea University, Konyang University Hospital, Ewha Womans University Seoul Hospital, Keimyung University Dongsan Medical Center, Pusan National University Yangsan Hospital, and D-Circle.
 This work was supported by the **AI Specialized Foundation Model Project** (인공지능 특화 파운데이션 모델 프로젝트), funded by the **Ministry of Science and ICT** (과학기술정보통신부, MSIT) and managed by the **National IT Industry Promotion Agency** (NIPA, 정보통신산업진흥원).
 ## License
 This model is released under the [Apache 2.0 License](LICENSE).
 ## Citation
 ```bibtex
@misc{gravity-preview-2026,
    title={Gravity-16B-A3B-Preview},
    author={Trillion Labs},
    year={2026},
    url={https://huggingface.co/trillionlabs/Gravity-16B-A3B-Preview}
 }
 ```
 ## Contact
 - Website: [trillionlabs.co](https://trillionlabs.co)
 - Website: [lunit.io](https://www.lunit.io)
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,112 @@
 {#- render_content(msg): emit the `content` field of a message.
     - string content is written as-is;
     - any other non-null content is iterated as a list of parts: parts whose
       `type` is image / image_url emit the fixed media placeholder token
       sequence, parts whose `type` is video / video_url emit the video
       placeholder token, and every other part emits its `text` field;
     - missing or null content renders nothing. -#}
 {%- macro render_content(msg) -%}
    {%- set c = msg.get('content') -%}
    {%- if c is string -%}
      {{ c }}
    {%- elif c is not none -%}
      {% for content in c -%}
        {% if content['type'] == 'image' or content['type'] == 'image_url' -%}
          <|media_begin|>image<|media_content|><|media_pad|><|media_end|>
        {% elif content['type'] == 'video' or content['type']== 'video_url'-%}
          <|kimi_k25_video_placeholder|>
        {% else -%}
          {{ content['text'] }}
        {%- endif -%}
      {%- endfor -%}
    {%- endif -%}
 {%- endmacro -%}
 {#- set_roles(message): emit the turn header for a message.
     The displayed name is `message.name` when it is truthy, otherwise the role.
     `user` and `assistant` get their own opening token; every other role
     (system, tool, ...) is opened with the system token. -#}
 {% macro set_roles(message) -%}
  {%- set role_name =  message.get('name') or  message['role'] -%}
  {%- if message['role'] == 'user' -%}
    <|im_user|>{{role_name}}<|im_middle|>
  {%- elif message['role'] == 'assistant' -%}
    <|im_assistant|>{{role_name}}<|im_middle|>
  {%- else -%}
    <|im_system|>{{role_name}}<|im_middle|>
  {%- endif -%}
 {%- endmacro -%}
 {#- render_toolcalls(message): emit the tool-call section of an assistant message.
     Each entry of `message['tool_calls']` is written as: begin token, the call
     `id`, argument-begin token, the arguments, end token. Arguments that are
     already a string are emitted verbatim; anything else is serialized with
     `tojson`. Only the call id is rendered here, not the function name
     (NOTE(review): presumably the id is expected to identify the function;
     confirm against the tool-call parser). -#}
 {%- macro render_toolcalls(message) -%}
  <|tool_calls_section_begin|>
  {%- for tool_call in message['tool_calls'] -%}
    {%- set formatted_id = tool_call['id'] -%}
    <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
  {%- endfor -%}
  <|tool_calls_section_end|>
 {%- endmacro -%}
 {#- Partition `messages` into `hist_msgs` (rendered with an empty think block)
     and `suffix_msgs` (rendered with their reasoning text).
     `preserve_thinking` falls back to false when the caller does not pass it.
     The scan below walks the messages from last to first and stops at the
     first assistant message that has no tool calls; the `break` tag relies on
     Jinja's loop-controls extension being enabled by the renderer. -#}
 {%- set preserve_thinking = preserve_thinking | default(false) -%}
 {# Find the last assistant message without tool calls. When preserve_thinking is set, the index stays -1, so the history is empty and every message is rendered as suffix (reasoning retained). #}
 {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
 {%- if not preserve_thinking -%}
 {%- for idx in range(messages|length-1, -1, -1) -%}
    {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
        {%- set ns.last_non_tool_call_assistant_msg = idx -%}
        {%- break -%}
    {%- endif -%}
 {%- endfor -%}
 {%- endif -%}
 {# Split all messages into history and suffix; reasoning content is preserved only for suffix messages. #}
 {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
 {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
 {#- Tool declaration: when `tools` is provided, emit one system turn named
     `tool_declare`. A truthy `tools_ts_str` is emitted verbatim and takes
     precedence; otherwise `tools` is serialized as compact JSON (no spaces
     after separators).
     NOTE(review): `tools_ts_str` is presumably a pre-rendered textual form of
     the tool schema supplied by the caller; confirm. -#}
 {%- if tools -%}
  {%- if tools_ts_str -%}
    <|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
  {%- else -%}
    <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
  {%- endif -%}
 {%- endif -%}
 {#- History turns. Each turn is: role header, body, end-of-turn token.
     - assistant: an empty think block (earlier reasoning is not replayed),
       then the content, then the tool-call section if the message has one;
     - tool: a "## Return of <tool_call_id>" line followed by the content;
     - any other role: the content, only when it is not null. -#}
 {%- for message in hist_msgs -%}
  {{set_roles(message)}}
  {%- if message['role'] == 'assistant' -%}
    <think></think>{{render_content(message)}}
    {%- if message.get('tool_calls') -%}
      {{render_toolcalls(message)}}
    {%- endif -%}
  {%- elif message['role'] == 'tool' -%}
    {%- set tool_call_id = message.tool_call_id -%}
    ## Return of {{ tool_call_id }}
 {{render_content(message)}}
  {%- elif message['content'] is not none -%}
    {{render_content(message)}}
  {%- endif -%}
  <|im_end|>
 {%- endfor -%}
 {#- Suffix turns. Same layout as the history turns, except for assistant
     messages: the think block carries the message's `reasoning` value (falling
     back to `reasoning_content`, then to an empty string). The think block is
     left empty only when `thinking` is explicitly false and
     `preserve_thinking` is false. -#}
 {%- for message in suffix_msgs -%}
  {{set_roles(message)}}
  {%- if message['role'] == 'assistant' -%}
    {%- if thinking is defined and thinking is false and preserve_thinking is false -%}
    <think></think>{{render_content(message)}}
    {%- else -%}
    {%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
    <think>{{rc}}</think>{{render_content(message)}}
    {%- endif -%}
    {%- if message.get('tool_calls') -%}
     {{render_toolcalls(message)}}
    {%- endif -%}
  {%- elif message['role'] == 'tool' -%}
    {%- set tool_call_id = message.tool_call_id -%}
    ## Return of {{ tool_call_id }}
 {{render_content(message)}}
  {%- elif message['content'] is not none -%}
    {{render_content(message)}}
  {%- endif -%}
  <|im_end|>
 {%- endfor -%}
 {#- Generation prompt: when `add_generation_prompt` is set, open a new
     assistant turn. If `thinking` is explicitly false, an already-closed empty
     think block is emitted; otherwise the think block is left open so that
     generation continues inside it. -#}
 {%- if add_generation_prompt -%}
  <|im_assistant|>assistant<|im_middle|>
  {%- if thinking is defined and thinking is false -%}
  <think></think>
  {%- else -%}
  <think>
  {%- endif -%}
 {%- endif -%}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,53 @@
 {
  "architectures": [
    "DeepseekV3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "aux_loss_alpha": 1e-06,
  "bos_token_id": 0,
  "dtype": "bfloat16",
  "eos_token_id": 151336,
  "first_k_dense_replace": 1,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "kv_lora_rank": 512,
  "max_position_embeddings": 131072,
  "model_type": "deepseek_v3",
  "moe_intermediate_size": 1408,
  "moe_layer_freq": 1,
  "n_group": 1,
  "n_routed_experts": 64,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 16,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 28,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
  "q_lora_rank": null,
  "qk_head_dim": 192,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
  "rms_norm_eps": 1e-06,
  "rope_interleave": true,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "routed_scaling_factor": 2.446,
  "scoring_func": "sigmoid",
  "tie_word_embeddings": false,
  "topk_group": 1,
  "topk_method": "noaux_tc",
  "transformers_version": "4.57.6",
  "use_cache": true,
  "v_head_dim": 128,
  "vocab_size": 163840,
  "auto_map": {
    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
    "AutoModel": "modeling_deepseek.DeepseekV3Model",
    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
  }
 }
--- a/configuration_deepseek.py
+++ b/configuration_deepseek.py
@@ -0,0 +1,214 @@
 # Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 # Module-level logger, named after this module.
 logger = logging.get_logger(__name__)
 # No pretrained-config archive entries are registered in this file; the map is left empty.
 DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DeepSeek-V3.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Every argument below is stored unchanged as an attribute of the same name, except the token ids,
    `tie_word_embeddings` and `**kwargs`, which are forwarded to `PretrainedConfig.__init__`.
    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`DeepseekV3Model`]
        hidden_size (`int`, *optional*, defaults to 7168):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 18432):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 2048):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 61):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of nextn predict layers in the DeepSeekV3 Model.
        num_attention_heads (`int`, *optional*, defaults to 128):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 128):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, it falls back to
            `num_attention_heads`.
        n_shared_experts (`int`, *optional*, defaults to 1):
            Number of shared experts, None means dense model.
        n_routed_experts (`int`, *optional*, defaults to 256):
            Number of routed experts, None means dense model.
        ep_size (`int`, *optional*, defaults to 1):
            Presumably the expert-parallel group size used by the MoE layers; confirm against the modeling code.
        routed_scaling_factor (`float`, *optional*, defaults to 2.5):
            Scaling factor of routed experts.
        kv_lora_rank (`int`, *optional*, defaults to 512):
            Presumably the rank of the low-rank key/value compression used by the attention layers; confirm
            against the modeling code.
        q_lora_rank (`int`, *optional*, defaults to 1536):
            Presumably the rank of the low-rank query projection; confirm against the modeling code. `None` is
            accepted and stored as-is.
        qk_rope_head_dim (`int`, *optional*, defaults to 64):
            Presumably the per-head query/key dimension that receives rotary position embeddings; confirm against
            the modeling code.
        v_head_dim (`int`, *optional*, defaults to 128):
            Presumably the per-head dimension of the value vectors; confirm against the modeling code.
        qk_nope_head_dim (`int`, *optional*, defaults to 128):
            Presumably the per-head query/key dimension that does not receive rotary position embeddings; confirm
            against the modeling code.
        topk_method (`str`, *optional*, defaults to `"noaux_tc"`):
            Topk method used in routed gate.
        n_group (`int`, *optional*, defaults to 8):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 4):
            Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to 8):
            Number of selected experts, None means dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 3):
            Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
                                                            \--k dense layers--/
        norm_topk_prob (`bool`, *optional*, defaults to `True`):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to `"sigmoid"`):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to `True`):
            Whether to compute the auxiliary loss for each individual sample.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        **kwargs:
            Additional keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`.
    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config
    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()
    >>> # Initializing a model from the configuration
    >>> model = DeepseekV3Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]
    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method='noaux_tc',
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func='sigmoid',
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Store each hyperparameter on the instance unchanged; no validation is performed here.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility: an explicit None falls back to the attention-head count
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        # Token ids, embedding tying and any extra keyword arguments are forwarded to the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,6 @@
 {
  "_from_model_config": true,
  "bos_token_id": 0,
  "eos_token_id": 151336,
  "transformers_version": "4.57.6"
 }
--- a/model-00001-of-00007.safetensors
+++ b/model-00001-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:854f96570da76c4a84c0c9f8e5d549b3f96806dfe0f91d69072825c5b56d63f6
 size 5045355830
--- a/model-00002-of-00007.safetensors
+++ b/model-00002-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:9f3d627e0d746f793bca3bbf8c6fed8f934b4a1e6384c7379807ef2f59b4ba8e
 size 4992426704
--- a/model-00003-of-00007.safetensors
+++ b/model-00003-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:401bfa37d3395681daf6907d2e3d3535b459f6af9c6882e54b47a0b85b7eac87
 size 4998709015
--- a/model-00004-of-00007.safetensors
+++ b/model-00004-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:398ea7f1e243e7d602ab427119b064241ec4eff7de88ef2117c025c1c82fd99e
 size 4996087805
--- a/model-00005-of-00007.safetensors
+++ b/model-00005-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:d9e2aa6676df7ec770d8d2874fc90360f971600dd94d1104335b224879f25b5e
 size 4995049023
--- a/model-00006-of-00007.safetensors
+++ b/model-00006-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:ce91b0332b01f40e38a55bf9964a1561aee2d5cc68b98346f999575f76d71426
 size 4996087783
--- a/model-00007-of-00007.safetensors
+++ b/model-00007-of-00007.safetensors
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:5cf77f837121e708934709c12b2bcdb5809a9f0e000948c218d3f5945c561170
 size 2561997258
--- a/model.safetensors.index.json
+++ b/model.safetensors.index.json
--- a/modeling_deepseek.py
+++ b/modeling_deepseek.py
--- a/preview-banner.png
+++ b/preview-banner.png
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:bbf154522311eb0af3d4e645c9da47c99a731a11ded726cdefebfd131966b610
 size 861396
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,40 @@
 {
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "eos_token": {
    "content": "<|user|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|begin_of_video|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
 }
--- a/tokenizer.json
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
 version https://git-lfs.github.com/spec/v1
 oid sha256:bda8e2146c3bb7b7e0fc96dcc4f0aeff041c6c27952e3ace0665663ebff346ba
 size 19970700
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
@@ -0,0 +1,325 @@
 {
  "added_tokens_decoder": {
    "151329": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151330": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151331": {
      "content": "[gMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151332": {
      "content": "[sMASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151333": {
      "content": "<sop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151334": {
      "content": "<eop>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151335": {
      "content": "<|system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151336": {
      "content": "<|user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151337": {
      "content": "<|assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151338": {
      "content": "<|observation|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151339": {
      "content": "<|begin_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151340": {
      "content": "<|end_of_image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151341": {
      "content": "<|begin_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151342": {
      "content": "<|end_of_video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151343": {
      "content": "<|begin_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151344": {
      "content": "<|end_of_audio|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151345": {
      "content": "<|begin_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151346": {
      "content": "<|end_of_transcription|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151347": {
      "content": "<|code_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151348": {
      "content": "<|code_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151349": {
      "content": "<|code_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151350": {
      "content": "<think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151351": {
      "content": "</think>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151352": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151353": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151354": {
      "content": "<tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151355": {
      "content": "</tool_response>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151356": {
      "content": "<arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151357": {
      "content": "</arg_key>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151358": {
      "content": "<arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151359": {
      "content": "</arg_value>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151360": {
      "content": "/nothink",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151361": {
      "content": "<|begin_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151362": {
      "content": "<|end_of_box|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151363": {
      "content": "<|image|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151364": {
      "content": "<|video|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|endoftext|>",
    "[MASK]",
    "[gMASK]",
    "[sMASK]",
    "<sop>",
    "<eop>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|observation|>",
    "<|begin_of_image|>",
    "<|end_of_image|>",
    "<|begin_of_video|>",
    "<|end_of_video|>",
    "<|begin_of_audio|>",
    "<|end_of_audio|>",
    "<|begin_of_transcription|>",
    "<|end_of_transcription|>",
    "<|code_prefix|>",
    "<|code_middle|>",
    "<|code_suffix|>",
    "/nothink"
  ],
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "<|user|>",
  "extra_special_tokens": {},
  "model_max_length": 128000,
  "pad_token": "<|begin_of_video|>",
  "padding_side": "left",
  "remove_space": false,
  "tokenizer_class": "PreTrainedTokenizerFast"
 }