初始化项目,由ModelHub XC社区提供模型
Model: inclusionAI/Ring-lite-distill-preview Source: Original Platform
This commit is contained in:
49
.gitattributes
vendored
Normal file
49
.gitattributes
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.db* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ark* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ggml filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 inclusionAI
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
113
README.md
Normal file
113
README.md
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
---
|
||||||
|
license: mit
|
||||||
|
language:
|
||||||
|
- zh
|
||||||
|
- en
|
||||||
|
base_model:
|
||||||
|
- inclusionAI/Ling-lite
|
||||||
|
pipeline_tag: text-generation
|
||||||
|
library_name: transformers
|
||||||
|
---
|
||||||
|
|
||||||
|
# Ring-lite-distill-preview
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://huggingface.co/inclusionAI/Ring-lite-distill-preview/resolve/main/ant-bailing.png" width="100"/>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
🤗 <a href="https://huggingface.co/inclusionAI">Hugging Face</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
Ring-lite-distill-preview is an MoE LLM provided and open-sourced by InclusionAI, which has 16.8B parameters with 2.75B activated parameters. It was fine-tuned from [Ling-lite](https://modelscope.cn/models/inclusionAI/Ling-lite) using extensive reasoning-focused instruction data. This model delivers performance comparable to DeepSeek-R1-Distill-Qwen-7B on reasoning benchmarks while achieving better results on general benchmarks, especially superior performance on function-calling evaluation benchmarks (e.g., T-eval, BFCL_v2) and instruction-following benchmarks (e.g., IFEval). This demonstrates that Ring-lite-distill is a more balanced and versatile model. Additionally, it maintains competitive latency and throughput compared to other reasoning LLMs of similar size.
|
||||||
|
|
||||||
|
## Model Downloads
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
| **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download** |
|
||||||
|
| :----------------: | :---------------: | :-------------------: | :----------------: | :----------: |
|
||||||
|
| Ring-lite-distill-preview | 16.8B | 2.75B | 64K | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ring-lite-distill) |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
In order to fully evaluate the model's performance, we examined Ring-lite-distill-preview in terms of both reasoning ability and general ability.
|
||||||
|
### Reasoning ability
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
| **Model** | **AIME24** | **MATH-500** | **GPQA-diamond** | **LiveCodeBench** |
|
||||||
|
| :----------------: | :---------------: | :-------------------: | :----------------: | :----------: |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-7B (reported) | 55.5 | 92.8 | 49.1 | 37.6 |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-7B (reproduce) | 53.2 | 93.7 | 50.4 | 36.5 |
|
||||||
|
| Ring-lite-distill-preview | 56.3 | 93.7 | 46.2 | 31.9 |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
### General ability
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
| **Model** | **IFEval** | **T-eval** | **BFCL_v2** | **MMLU** |
|
||||||
|
| :----------------: | :---------------: | :-------------------: | :----------------: | :----------: |
|
||||||
|
| DeepSeek-R1-Distill-Qwen-7B (reproduce) | 39.3 | 26.9 | 38.9 | 44.1 |
|
||||||
|
| Ring-lite-distill-preview | 75.3 | 81.3 | 63.0 | 63.3 |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
More details will be reported in our [technical report](https://github.com/inclusionAI/Ring/blob/main/Ring_Lite_Distill_Preview.pdf).
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
### 🤗 Hugging Face Transformers
|
||||||
|
Here is a code snippet to show you how to use the chat model with `transformers`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "inclusionAI/Ring-lite-distill-preview"

# The checkpoint's config.json maps the Auto* classes (via "auto_map") to code
# shipped inside the repository (configuration_bailing_moe.py /
# modeling_bailing_moe.py), so remote code has to be trusted explicitly for
# the model to load.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to large language models."
messages = [
    {"role": "system", "content": "You are Ring, an assistant created by inclusionAI"},
    {"role": "user", "content": prompt}
]

# Render the conversation with the tokenizer's chat template as plain text;
# add_generation_prompt=True appends the assistant header so the model
# continues with its own turn.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=8192
)

# generate() returns prompt + completion; drop the prompt tokens of each
# sequence so that only the newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dataset
|
||||||
|
The training data of Ring-lite-distill-preview will be released soon.
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
Please refer to [GitHub](https://github.com/inclusionAI/Ring/blob/main/README.md).
|
||||||
|
|
||||||
|
## License
|
||||||
|
This code repository is licensed under [the MIT License](https://huggingface.co/inclusionAI/Ring-lite-distill/blob/main/LICENSE).
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
[TBD]
|
||||||
BIN
ant-bailing.png
Normal file
BIN
ant-bailing.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 57 KiB |
44
config.json
Normal file
44
config.json
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"BailingMoeForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"auto_map": {
|
||||||
|
"AutoConfig": "configuration_bailing_moe.BailingMoeConfig",
|
||||||
|
"AutoModel": "modeling_bailing_moe.BailingMoeModel",
|
||||||
|
"AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM"
|
||||||
|
},
|
||||||
|
"eos_token_id": 126081,
|
||||||
|
"pad_token_id": 126081,
|
||||||
|
"first_k_dense_replace": 0,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 2048,
|
||||||
|
"initializer_range": 0.006,
|
||||||
|
"intermediate_size": 5632,
|
||||||
|
"max_position_embeddings": 16384,
|
||||||
|
"model_type": "bailing_moe",
|
||||||
|
"moe_intermediate_size": 1408,
|
||||||
|
"num_experts": 64,
|
||||||
|
"num_shared_experts": 2,
|
||||||
|
"norm_topk_prob": true,
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_experts_per_tok": 6,
|
||||||
|
"num_hidden_layers": 28,
|
||||||
|
"num_key_value_heads": 4,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 600000,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.36.0",
|
||||||
|
"use_cache": true,
|
||||||
|
"use_bias": false,
|
||||||
|
"use_qkv_bias": false,
|
||||||
|
"vocab_size": 126464,
|
||||||
|
"output_router_logits": false,
|
||||||
|
"embedding_dropout": 0.0,
|
||||||
|
"norm_head": true,
|
||||||
|
"norm_softmax": false,
|
||||||
|
"output_dropout": 0.0
|
||||||
|
}
|
||||||
1
configuration.json
Normal file
1
configuration.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"framework":"Pytorch","task":"text-generation"}
|
||||||
78
configuration_bailing_moe.py
Normal file
78
configuration_bailing_moe.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
""" Bailing MoE model configuration """
|
||||||
|
|
||||||
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
class BailingMoeConfig(PretrainedConfig):
    """Configuration container for the Bailing mixture-of-experts model.

    Every constructor argument is stored unchanged on the instance under the
    same name, with two exceptions:

    * ``head_dim`` falls back to ``hidden_size // num_attention_heads`` when
      it is ``None`` (or otherwise falsy).
    * ``pad_token_id`` and ``tie_word_embeddings`` are not assigned here; they
      are forwarded, together with any extra ``**kwargs``, to
      ``PretrainedConfig.__init__``.

    NOTE(review): how each value is consumed is decided by the modeling code,
    which is not visible from this class; the section comments below are
    derived from the argument names only.
    """

    model_type = "bailing_moe"

    def __init__(
        self,
        vocab_size=30592,
        hidden_size=1024,
        intermediate_size=None,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=0,
        hidden_act="silu",
        use_qkv_bias=False,  # bailing only
        use_bias=True,  # bailing only
        rms_norm_eps=1e-05,
        norm_head=False,  # bailing only
        tie_word_embeddings=False,  # PretrainedConfig key, here change default value.
        embedding_dropout=0.1,
        attention_dropout=0.1,
        output_dropout=0.1,
        initializer_range=0.02,
        max_position_embeddings=16384,
        rope_theta=10000.0,
        use_cache=True,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        rope_scaling=None,
        pad_token_id=126081,
        num_experts=16,
        num_shared_experts=0,
        num_experts_per_tok=2,
        norm_topk_prob=True,
        moe_intermediate_size=None,
        first_k_dense_replace=0,
        head_dim=None,
        output_router_logits=False,
        **kwargs,
    ):
        # --- sizes ---------------------------------------------------------
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers

        # --- attention -----------------------------------------------------
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        # An explicit (truthy) head_dim wins; otherwise derive it from the
        # hidden size and the number of attention heads.
        self.head_dim = head_dim if head_dim else self.hidden_size // self.num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window
        self.max_window_layers = max_window_layers

        # --- positions / rotary embedding ----------------------------------
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # --- activation, bias and normalisation switches --------------------
        self.hidden_act = hidden_act
        self.use_qkv_bias = use_qkv_bias
        self.use_bias = use_bias
        self.rms_norm_eps = rms_norm_eps
        self.norm_head = norm_head

        # --- dropout and initialisation -------------------------------------
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.output_dropout = output_dropout
        self.initializer_range = initializer_range

        # --- runtime --------------------------------------------------------
        self.use_cache = use_cache

        # --- mixture-of-experts ---------------------------------------------
        self.num_experts = num_experts
        self.num_shared_experts = num_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.norm_topk_prob = norm_topk_prob
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.output_router_logits = output_router_logits

        # The base-class initialiser runs last, receiving the two keys it
        # owns plus every unrecognised keyword argument.
        super().__init__(
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:cf3d311c600c7dff74d973d95660c8464b310a1d177d694b0cac1a1fb81f309a
|
||||||
|
size 9305327072
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:70b127b1b5d3a808585edcffe78ebd60c52bf58e5b0d095af51c7cc62990a381
|
||||||
|
size 9305328272
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:63eb2564a66331fb8cf5520782be797c467e4c890e4e44a338bfc90ee12bddbf
|
||||||
|
size 9305328672
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d1cc12c93b2b9d4f6880f96c9dc9c1ebe415ace99f39cbf398e673be1a23f3e4
|
||||||
|
size 5688662080
|
||||||
5611
model.safetensors.index.json
Normal file
5611
model.safetensors.index.json
Normal file
File diff suppressed because it is too large
Load Diff
1549
modeling_bailing_moe.py
Normal file
1549
modeling_bailing_moe.py
Normal file
File diff suppressed because it is too large
Load Diff
15
special_tokens_map.json
Normal file
15
special_tokens_map.json
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<|number_end|>",
|
||||||
|
"<|arithmetic_start|>",
|
||||||
|
"</role>",
|
||||||
|
"<|arithmetic_end|>",
|
||||||
|
"<role>",
|
||||||
|
"<|number_start|>"
|
||||||
|
],
|
||||||
|
"bos_token": "<|startoftext|>",
|
||||||
|
"cls_token": "[CLS]",
|
||||||
|
"eos_token": "<|endoftext|>",
|
||||||
|
"gmask_token": "[gMASK]",
|
||||||
|
"pad_token": "<|endoftext|>"
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:e4263f84d1ae750eb427be937562c33737b5bb035fe107fd414d27c766d1f629
|
||||||
|
size 6098421
|
||||||
25
tokenizer_config.json
Normal file
25
tokenizer_config.json
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"add_bos_token": false,
|
||||||
|
"add_eos_token": false,
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<role>",
|
||||||
|
"</role>",
|
||||||
|
"<|arithmetic_start|>",
|
||||||
|
"<|arithmetic_end|>",
|
||||||
|
"<|number_start|>",
|
||||||
|
"<|number_end|>"
|
||||||
|
],
|
||||||
|
"bos_token": "<|startoftext|>",
|
||||||
|
"chat_template": "{% for message in messages %}{% set role = message['role'] | lower %}{% if role == 'user' %}{% set role = 'HUMAN' %}{% endif %}{% set role = role | upper %}{{ '<role>' + role + '</role>' + message['content'].split('</think>')[-1].lstrip('\\n') }}{% endfor %}{% if add_generation_prompt %}{{ '<role>ASSISTANT</role><think>' }}{% endif %}",
|
||||||
|
"clean_up_tokenization_spaces": false,
|
||||||
|
"cls_token": "[CLS]",
|
||||||
|
"eos_token": "<|endoftext|>",
|
||||||
|
"gmask_token": "[gMASK]",
|
||||||
|
"merges_file": null,
|
||||||
|
"model_max_length": 1000000000000000019884624838656,
|
||||||
|
"pad_token": "<|endoftext|>",
|
||||||
|
"tokenizer_class": "PreTrainedTokenizerFast",
|
||||||
|
"trust_remote_code": true,
|
||||||
|
"vocab_file": null,
|
||||||
|
"fast_tokenizer": true
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user