初始化项目,由ModelHub XC社区提供模型

Model: Godcat252/Besttop977
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-23 21:14:22 +08:00
commit d36868b650
20 changed files with 8552 additions and 0 deletions

37
.gitattributes vendored Normal file
View File

@@ -0,0 +1,37 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
preview-banner.png filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

190
LICENSE Normal file
View File

@@ -0,0 +1,190 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding any notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2026 Trillion Labs
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

211
README.md Normal file
View File

@@ -0,0 +1,211 @@
---
license: apache-2.0
language:
- en
- ko
library_name: transformers
tags:
- moe
- mixture-of-experts
- gravity
- trillion-labs
- chat
- post-trained
- preview
pipeline_tag: text-generation
base_model:
- trillionlabs/Gravity-16B-A3B-Base
---
<p align="center">
<img src="preview-banner.png" alt="Gravity-16B-A3B-Preview" width="100%">
</p>
# Gravity-16B-A3B-Preview
**Gravity-16B-A3B-Preview** is a post-trained language model built on [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) by [Trillion Labs](https://trillionlabs.co). Starting from the base model, it underwent context length extension (32K → 128K), supervised fine-tuning (SFT), and reinforcement learning (GRPO) focused on science and code.
This is a preview release offering a strong balance of capability, efficiency, and long-context support for its size. We are actively working on agentic capabilities for the full release.
## Model Summary
| Property | Value |
|---|---|
| **Base Model** | [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) |
| **Total Parameters** | 16.24B |
| **Active Parameters** | 3.16B |
| **Architecture** | GravityMoE |
| **Context Length** | 131,072 tokens (128K) |
| **Precision** | bf16 |
| **License** | Apache 2.0 |
For full architectural details (MLA, MoE routing, tokenizer, etc.), see the [base model card](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base).
## Post-Training Pipeline
Starting from [Gravity-16B-A3B-Base](https://huggingface.co/trillionlabs/Gravity-16B-A3B-Base) (pretrained on ~5.5T tokens):
1. **Context Length Extension** — Extended from 32K to 128K tokens.
2. **Supervised Fine-Tuning (SFT)** — Instruction tuning for general chat and task-following capabilities.
3. **Reinforcement Learning (GRPO)** — Single-step Group Relative Policy Optimization focused on science and code domains.
Agentic RL and multi-turn RL stages are in progress and will be included in future releases.
## Evaluation Results
| Category | Benchmark | Metric | Score |
|---|---|---|---|
| **Math** | AIME 2024 | acc | 43.3 |
| | GSM8K | acc | 91.8 |
| | MATH500 | acc | 88.6 |
| **Code** | HumanEval | pass@1 | 89.0 |
| | MBPP | pass@1 | 96.0 |
| | LiveCodeBench V6 | pass@1 | 41.0 |
| **Knowledge** | MMLU | acc | 80.1 |
| | MMLU-Pro | acc | 71.5 |
| | BBH | acc | 79.24 |
| **Science** | GPQA Diamond | acc | 55.1 |
| | Arc Challenge | acc | 92.32 |
| | ChemBench | acc | 68.6 |
| | Molang Bench (Editing) | SMILEs validty / Tanimoto similarity / Accuracy | 70.83 / 86.43 / 43.23 |
| | Molang Bench (Generation) | SMILEs validty / Tanimoto similarity / Accuracy | 35.96 / 43.24 / 1.69 |
| **Instruction Following** | IFEval | instruct level loose | 84.53 |
| | IFBench | instruct level loose | 46.51 |
| **Agentic** | Tau^2 (Telecom) | pass@1 | 71.93 |
| | Scicode | sub problem level | 18.8 |
| | Terminal Bench | pass@1 | 21.25 |
| **Long Context** | AA-LCR | pass@1 | 21.0 |
### Comparison with Moonlight-16B-A3B-Instruct
| Category | Benchmark | Metric | Gravity-16B-A3B-Preview | Moonlight-16B-A3B-Instruct |
|---|---|---|---|---|
| **Math** | GSM8K | acc | 91.8 | 77.4 |
| **Code** | HumanEval | pass@1 | 89.0 | 48.1 |
| | MBPP | pass@1 | 96.0 | 63.8 |
| **Knowledge** | MMLU | acc | 80.1 | 70.0 |
| | MMLU-Pro | acc | 71.5 | 42.4 |
| | BBH | acc | 79.24 | 65.2 |
> Note: We include Moonlight-16B-A3B-Instruct for comparison since it is similar in size to our model. Moonlight-16B-A3B-Instruct scores are taken from the numbers reported in their own technical report.
With 3.16B active parameters, 128K context, and broad coverage across math, code, and knowledge benchmarks, the model offers a strong balance of capability and efficiency for its size.
Agentic benchmarks (multi-step tool use, code execution) are not yet a focus of this release. We are actively training on agentic tasks and will include those results in the next release.
## Quickstart
### Installation
```bash
pip install "transformers>=5.0" torch
```
### Using Transformers
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_name = "trillionlabs/Gravity-16B-A3B-Preview"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_name,
trust_remote_code=True,
torch_dtype=torch.bfloat16,
device_map="auto",
)
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Solve the equation: x^3 - 6x^2 + 11x - 6 = 0"},
]
input_ids = tokenizer.apply_chat_template(
messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output = model.generate(input_ids, max_new_tokens=1024, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
## Deployment
> **Note:** We are working on upstreaming native GravityMoE support to [SGLang](https://github.com/sgl-project/sglang). Until the PR is merged, please use the installation steps below.
### SGLang
Install SGLang from the [sglang-gravity](https://github.com/trillion-labs/sglang-gravity) fork:
```bash
pip install "sglang[all] @ git+https://github.com/trillion-labs/sglang-gravity.git#subdirectory=python"
```
Launch the server:
```bash
python3 -m sglang.launch_server \
--model-path trillionlabs/Gravity-16B-A3B-Preview \
--host 0.0.0.0 \
--port 30000 \
--tp 8 \
--trust-remote-code \
--moe-runner-backend triton \
--tool-call-parser glm45 \
--reasoning-parser glm45 \
--dtype bfloat16
```
Send a request:
```bash
curl http://localhost:30000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "trillionlabs/Gravity-16B-A3B-Preview",
"messages": [{"role": "user", "content": "What is the capital of South Korea?"}],
"max_tokens": 128,
"temperature": 0.7
}'
```
## Limitations
- This is a preview release. Agentic and multi-turn capabilities are under active development.
- The model may generate factually incorrect, biased, or harmful content.
- Performance may degrade on languages not well-represented in the training data.
## Acknowledgements
This model was developed as part of a collaborative research initiative led by **Lunit** and **Trillion Labs**, with a focus on advancing foundation models for science and healthcare.
- **Lunit** — Project lead and medical AI research
- **Trillion Labs** — Model architecture, pretraining, and infrastructure
- **Aigen Science** — Biomedical AI and drug discovery research
- **SK Biopharmaceuticals** — AI-driven drug development and digital healthcare advisory
- **Kakao Healthcare** — Medical data standardization and platform support
We also thank the following participating institutions for their contributions: KAIST (Yoonjae Choi, Taekyun Kim, Jong Chul Ye, Hyunwoo Kim, Seunghoon Hong), Seoul National University (Yousung Jung), Rebellions, Standigm, NHIS Ilsan Hospital, Yongin Severance Hospital, Gangdong Kyung Hee University Hospital, Kyung Hee University Medical Center, Korea University, Konyang University Hospital, Ewha Womans University Seoul Hospital, Keimyung University Dongsan Medical Center, Pusan National University Yangsan Hospital, and D-Circle.
This work was supported by the **AI Specialized Foundation Model Project** (인공지능 특화 파운데이션 모델 프로젝트), funded by the **Ministry of Science and ICT** (과학기술정보통신부, MSIT) and managed by the **National IT Industry Promotion Agency** (NIPA, 정보통신산업진흥원).
## License
This model is released under the [Apache 2.0 License](LICENSE).
## Citation
```bibtex
@misc{gravity-preview-2026,
title={Gravity-16B-A3B-Preview},
author={Trillion Labs},
year={2026},
url={https://huggingface.co/trillionlabs/Gravity-16B-A3B-Preview}
}
```
## Contact
- Website: [trillionlabs.co](https://trillionlabs.co)
- Website: [lunit.io](https://www.lunit.io)

112
chat_template.jinja Normal file
View File

@@ -0,0 +1,112 @@
{%- macro render_content(msg) -%}
{%- set c = msg.get('content') -%}
{%- if c is string -%}
{{ c }}
{%- elif c is not none -%}
{% for content in c -%}
{% if content['type'] == 'image' or content['type'] == 'image_url' -%}
<|media_begin|>image<|media_content|><|media_pad|><|media_end|>
{% elif content['type'] == 'video' or content['type']== 'video_url'-%}
<|kimi_k25_video_placeholder|>
{% else -%}
{{ content['text'] }}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- endmacro -%}
{% macro set_roles(message) -%}
{%- set role_name = message.get('name') or message['role'] -%}
{%- if message['role'] == 'user' -%}
<|im_user|>{{role_name}}<|im_middle|>
{%- elif message['role'] == 'assistant' -%}
<|im_assistant|>{{role_name}}<|im_middle|>
{%- else -%}
<|im_system|>{{role_name}}<|im_middle|>
{%- endif -%}
{%- endmacro -%}
{%- macro render_toolcalls(message) -%}
<|tool_calls_section_begin|>
{%- for tool_call in message['tool_calls'] -%}
{%- set formatted_id = tool_call['id'] -%}
<|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
{%- endfor -%}
<|tool_calls_section_end|>
{%- endmacro -%}
{%- set preserve_thinking = preserve_thinking | default(false) -%}
{# Find last non-tool-call assistant message. If preserve_thinking, keep -1 so hist is empty and all msgs use suffix (retain reasoning). #}
{%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
{%- if not preserve_thinking -%}
{%- for idx in range(messages|length-1, -1, -1) -%}
{%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
{%- set ns.last_non_tool_call_assistant_msg = idx -%}
{%- break -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{# split all messages into history & suffix, reasoning_content in suffix should be reserved.#}
{%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
{%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
{%- if tools -%}
{%- if tools_ts_str -%}
<|im_system|>tool_declare<|im_middle|>{{ tools_ts_str }}<|im_end|>
{%- else -%}
<|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
{%- endif -%}
{%- endif -%}
{%- for message in hist_msgs -%}
{{set_roles(message)}}
{%- if message['role'] == 'assistant' -%}
<think></think>{{render_content(message)}}
{%- if message.get('tool_calls') -%}
{{render_toolcalls(message)}}
{%- endif -%}
{%- elif message['role'] == 'tool' -%}
{%- set tool_call_id = message.tool_call_id -%}
## Return of {{ tool_call_id }}
{{render_content(message)}}
{%- elif message['content'] is not none -%}
{{render_content(message)}}
{%- endif -%}
<|im_end|>
{%- endfor -%}
{%- for message in suffix_msgs -%}
{{set_roles(message)}}
{%- if message['role'] == 'assistant' -%}
{%- if thinking is defined and thinking is false and preserve_thinking is false -%}
<think></think>{{render_content(message)}}
{%- else -%}
{%- set rc = message.get('reasoning', message.get('reasoning_content', '')) -%}
<think>{{rc}}</think>{{render_content(message)}}
{%- endif -%}
{%- if message.get('tool_calls') -%}
{{render_toolcalls(message)}}
{%- endif -%}
{%- elif message['role'] == 'tool' -%}
{%- set tool_call_id = message.tool_call_id -%}
## Return of {{ tool_call_id }}
{{render_content(message)}}
{%- elif message['content'] is not none -%}
{{render_content(message)}}
{%- endif -%}
<|im_end|>
{%- endfor -%}
{%- if add_generation_prompt -%}
<|im_assistant|>assistant<|im_middle|>
{%- if thinking is defined and thinking is false -%}
<think></think>
{%- else -%}
<think>
{%- endif -%}
{%- endif -%}

53
config.json Normal file
View File

@@ -0,0 +1,53 @@
{
"architectures": [
"DeepseekV3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"aux_loss_alpha": 1e-06,
"bos_token_id": 0,
"dtype": "bfloat16",
"eos_token_id": 151336,
"first_k_dense_replace": 1,
"head_dim": 64,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 8192,
"kv_lora_rank": 512,
"max_position_embeddings": 131072,
"model_type": "deepseek_v3",
"moe_intermediate_size": 1408,
"moe_layer_freq": 1,
"n_group": 1,
"n_routed_experts": 64,
"n_shared_experts": 1,
"norm_topk_prob": true,
"num_attention_heads": 16,
"num_experts_per_tok": 8,
"num_hidden_layers": 28,
"num_key_value_heads": 16,
"pretraining_tp": 1,
"q_lora_rank": null,
"qk_head_dim": 192,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"rms_norm_eps": 1e-06,
"rope_interleave": true,
"rope_scaling": null,
"rope_theta": 1000000.0,
"routed_scaling_factor": 2.446,
"scoring_func": "sigmoid",
"tie_word_embeddings": false,
"topk_group": 1,
"topk_method": "noaux_tc",
"transformers_version": "4.57.6",
"use_cache": true,
"v_head_dim": 128,
"vocab_size": 163840,
"auto_map": {
"AutoConfig": "configuration_deepseek.DeepseekV3Config",
"AutoModel": "modeling_deepseek.DeepseekV3Model",
"AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
}
}

214
configuration_deepseek.py Normal file
View File

@@ -0,0 +1,214 @@
# Copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV3Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeepSeek-V3.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 129280):
Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DeepseekV3Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 1407):
Dimension of the MoE representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_nextn_predict_layers (`int`, *optional*, defaults to 1):
Number of nextn predict layers in the DeepSeekV3 Model.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
n_shared_experts (`int`, *optional*, defaults to None):
Number of shared experts, None means dense model.
n_routed_experts (`int`, *optional*, defaults to None):
Number of routed experts, None means dense model.
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
Scaling factor or routed experts.
topk_method (`str`, *optional*, defaults to `gready`):
Topk method used in routed gate.
n_group (`int`, *optional*, defaults to None):
Number of groups for routed experts.
topk_group (`int`, *optional*, defaults to None):
Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
num_experts_per_tok (`int`, *optional*, defaults to None):
Number of selected experts, None means dense model.
moe_layer_freq (`int`, *optional*, defaults to 1):
The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
first_k_dense_replace (`int`, *optional*, defaults to 0):
Number of dense layers in shallow layers(embed->dense->dense->...->dense->moe->moe...->lm_head).
\--k dense layers--/
norm_topk_prob (`bool`, *optional*, defaults to False):
Whether to normalize the weights of the routed experts.
scoring_func (`str`, *optional*, defaults to 'softmax'):
Method of computing expert weights.
aux_loss_alpha (`float`, *optional*, defaults to 0.001):
Auxiliary loss weight coefficient.
seq_aux = (`bool`, *optional*, defaults to True):
Whether to compute the auxiliary loss for each individual sample.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum.
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
```python
>>> from transformers import DeepseekV3Model, DeepseekV3Config
>>> # Initializing a Deepseek-V3 style configuration
>>> configuration = DeepseekV3Config()
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deepseek_v3"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=129280,
hidden_size=7168,
intermediate_size=18432,
moe_intermediate_size=2048,
num_hidden_layers=61,
num_nextn_predict_layers=1,
num_attention_heads=128,
num_key_value_heads=128,
n_shared_experts=1,
n_routed_experts=256,
ep_size=1,
routed_scaling_factor=2.5,
kv_lora_rank=512,
q_lora_rank=1536,
qk_rope_head_dim=64,
v_head_dim=128,
qk_nope_head_dim=128,
topk_method='noaux_tc',
n_group=8,
topk_group=4,
num_experts_per_tok=8,
moe_layer_freq=1,
first_k_dense_replace=3,
norm_topk_prob=True,
scoring_func='sigmoid',
aux_loss_alpha=0.001,
seq_aux=True,
hidden_act="silu",
max_position_embeddings=4096,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=0,
eos_token_id=1,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_nextn_predict_layers = num_nextn_predict_layers
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
self.ep_size = ep_size
self.routed_scaling_factor = routed_scaling_factor
self.kv_lora_rank = kv_lora_rank
self.q_lora_rank = q_lora_rank
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.qk_nope_head_dim = qk_nope_head_dim
self.topk_method = topk_method
self.n_group = n_group
self.topk_group = topk_group
self.num_experts_per_tok = num_experts_per_tok
self.moe_layer_freq = moe_layer_freq
self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func
self.aux_loss_alpha = aux_loss_alpha
self.seq_aux = seq_aux
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)

6
generation_config.json Normal file
View File

@@ -0,0 +1,6 @@
{
"_from_model_config": true,
"bos_token_id": 0,
"eos_token_id": 151336,
"transformers_version": "4.57.6"
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:854f96570da76c4a84c0c9f8e5d549b3f96806dfe0f91d69072825c5b56d63f6
size 5045355830

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9f3d627e0d746f793bca3bbf8c6fed8f934b4a1e6384c7379807ef2f59b4ba8e
size 4992426704

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:401bfa37d3395681daf6907d2e3d3535b459f6af9c6882e54b47a0b85b7eac87
size 4998709015

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:398ea7f1e243e7d602ab427119b064241ec4eff7de88ef2117c025c1c82fd99e
size 4996087805

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d9e2aa6676df7ec770d8d2874fc90360f971600dd94d1104335b224879f25b5e
size 4995049023

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ce91b0332b01f40e38a55bf9964a1561aee2d5cc68b98346f999575f76d71426
size 4996087783

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cf77f837121e708934709c12b2bcdb5809a9f0e000948c218d3f5945c561170
size 2561997258

5529
model.safetensors.index.json Normal file

File diff suppressed because it is too large Load Diff

1808
modeling_deepseek.py Normal file

File diff suppressed because it is too large Load Diff

3
preview-banner.png Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bbf154522311eb0af3d4e645c9da47c99a731a11ded726cdefebfd131966b610
size 861396

40
special_tokens_map.json Normal file
View File

@@ -0,0 +1,40 @@
{
"additional_special_tokens": [
"<|endoftext|>",
"[MASK]",
"[gMASK]",
"[sMASK]",
"<sop>",
"<eop>",
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|observation|>",
"<|begin_of_image|>",
"<|end_of_image|>",
"<|begin_of_video|>",
"<|end_of_video|>",
"<|begin_of_audio|>",
"<|end_of_audio|>",
"<|begin_of_transcription|>",
"<|end_of_transcription|>",
"<|code_prefix|>",
"<|code_middle|>",
"<|code_suffix|>",
"/nothink"
],
"eos_token": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|begin_of_video|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bda8e2146c3bb7b7e0fc96dcc4f0aeff041c6c27952e3ace0665663ebff346ba
size 19970700

325
tokenizer_config.json Normal file
View File

@@ -0,0 +1,325 @@
{
"added_tokens_decoder": {
"151329": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151330": {
"content": "[MASK]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151331": {
"content": "[gMASK]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151332": {
"content": "[sMASK]",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151333": {
"content": "<sop>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151334": {
"content": "<eop>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151335": {
"content": "<|system|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151336": {
"content": "<|user|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151337": {
"content": "<|assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151338": {
"content": "<|observation|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151339": {
"content": "<|begin_of_image|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151340": {
"content": "<|end_of_image|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151341": {
"content": "<|begin_of_video|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151342": {
"content": "<|end_of_video|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151343": {
"content": "<|begin_of_audio|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151344": {
"content": "<|end_of_audio|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151345": {
"content": "<|begin_of_transcription|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151346": {
"content": "<|end_of_transcription|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151347": {
"content": "<|code_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151348": {
"content": "<|code_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151349": {
"content": "<|code_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151350": {
"content": "<think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151351": {
"content": "</think>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151352": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151353": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151354": {
"content": "<tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151355": {
"content": "</tool_response>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151356": {
"content": "<arg_key>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151357": {
"content": "</arg_key>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151358": {
"content": "<arg_value>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151359": {
"content": "</arg_value>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151360": {
"content": "/nothink",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151361": {
"content": "<|begin_of_box|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151362": {
"content": "<|end_of_box|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151363": {
"content": "<|image|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151364": {
"content": "<|video|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"additional_special_tokens": [
"<|endoftext|>",
"[MASK]",
"[gMASK]",
"[sMASK]",
"<sop>",
"<eop>",
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|observation|>",
"<|begin_of_image|>",
"<|end_of_image|>",
"<|begin_of_video|>",
"<|end_of_video|>",
"<|begin_of_audio|>",
"<|end_of_audio|>",
"<|begin_of_transcription|>",
"<|end_of_transcription|>",
"<|code_prefix|>",
"<|code_middle|>",
"<|code_suffix|>",
"/nothink"
],
"clean_up_tokenization_spaces": false,
"do_lower_case": false,
"eos_token": "<|user|>",
"extra_special_tokens": {},
"model_max_length": 128000,
"pad_token": "<|begin_of_video|>",
"padding_side": "left",
"remove_space": false,
"tokenizer_class": "PreTrainedTokenizerFast"
}