初始化项目,由ModelHub XC社区提供模型
Model: inclusionAI/Ring-lite-2506 Source: Original Platform
This commit is contained in:
49
.gitattributes
vendored
Normal file
49
.gitattributes
vendored
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.db* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ark* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
||||||
|
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ggml filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 inclusionAI
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
106
README.md
Normal file
106
README.md
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
---
|
||||||
|
license: mit
|
||||||
|
language:
|
||||||
|
- zh
|
||||||
|
- en
|
||||||
|
base_model:
|
||||||
|
- inclusionAI/Ling-lite-base-1.5
|
||||||
|
---
|
||||||
|
# Ring-lite-2506
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*4QxcQrBlTiAAAAAAQXAAAAgAemJ7AQ/original" width="100"/>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
🤗 <a href="https://huggingface.co/inclusionAI">Hugging Face</a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
Ring-lite-2506 is a lightweight, fully open-sourced MoE (Mixture of Experts) LLM designed for complex reasoning tasks. It is built upon the publicly available [Ling-lite-1.5](https://huggingface.co/inclusionAI/Ling-lite-1.5) model, which has 16.8B parameters with 2.75B activated parameters. We use a joint training pipeline combining knowledge distillation with reinforcement learning, achieving performance comparable to state-of-the-art (SOTA) small-size reasoning models on challenging benchmarks (AIME, LiveCodeBench, and GPQA-Diamond) while activating only one-third of their parameters.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Downloads
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
|
||||||
|
| **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download** |
|
||||||
|
| :----------------: | :---------------: | :-------------------: | :----------------: | :----------: |
|
||||||
|
| Ring-lite-2506 | 16.8B | 2.75B | 128K | [🤗 HuggingFace](https://huggingface.co/inclusionAI/Ring-lite-2506) |
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
For a comprehensive evaluation of the quality of our reasoning models, we implemented automatic benchmarks to assess their performance on math, code, and science tasks.
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://mdn.alipayobjects.com/huamei_qa8qxu/afts/img/A*iAXESaxrbDcAAAAATtAAAAgAemJ7AQ/original" width="1000"/>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
More details are reported in our [technical report](https://arxiv.org/abs/2506.14731).
|
||||||
|
|
||||||
|
## Quickstart
|
||||||
|
|
||||||
|
### 🤗 Hugging Face Transformers
|
||||||
|
Here is a code snippet to show you how to use the chat model with `transformers`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer

# Repository id used for both the model weights and the tokenizer.
model_name = "inclusionAI/Ring-lite-2506"

# Load the causal LM, passing "auto" for both dtype selection and device placement.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Conversation: one system message followed by the user's prompt.
prompt = "Give me a short introduction to large language models."
system_turn = {"role": "system", "content": "You are Ring, an assistant created by inclusionAI"}
user_turn = {"role": "user", "content": prompt}
messages = [system_turn, user_turn]

# Render the conversation to a string (tokenize=False) with the generation prompt appended,
# then tokenize it as a batch of one and move the tensors to the model's device.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate at most 8192 new tokens.
full_sequences = model.generate(**model_inputs, max_new_tokens=8192)

# Skip the first len(input_ids) ids of every output sequence
# (presumably the echoed prompt -- generate() is assumed to return prompt + continuation).
generated_ids = []
for input_ids, output_ids in zip(model_inputs.input_ids, full_sequences):
    generated_ids.append(output_ids[len(input_ids):])

# Decode the batch and keep the single response text.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Dataset
|
||||||
|
The training data of Ring-lite-2506 is released at [Ring-lite-sft-data](https://huggingface.co/datasets/inclusionAI/Ring-lite-sft-data) and [Ring-lite-rl-data](https://huggingface.co/datasets/inclusionAI/Ring-lite-rl-data).
|
||||||
|
|
||||||
|
## Deployment
|
||||||
|
Please refer to [GitHub](https://github.com/inclusionAI/Ring/blob/main/README.md)
|
||||||
|
|
||||||
|
## License
|
||||||
|
This code repository is licensed under [the MIT License](https://huggingface.co/inclusionAI/Ring-lite-2506/blob/main/LICENSE).
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
```
|
||||||
|
@misc{ringteam2025ringlitescalablereasoningc3postabilized,
|
||||||
|
title={Ring-lite: Scalable Reasoning via C3PO-Stabilized Reinforcement Learning for LLMs},
|
||||||
|
author={Ling Team},
|
||||||
|
year={2025},
|
||||||
|
eprint={2506.14731},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={cs.CL},
|
||||||
|
url={https://arxiv.org/abs/2506.14731},
|
||||||
|
}
|
||||||
|
```
|
||||||
44
config.json
Normal file
44
config.json
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"BailingMoeForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"auto_map": {
|
||||||
|
"AutoConfig": "configuration_bailing_moe.BailingMoeConfig",
|
||||||
|
"AutoModel": "modeling_bailing_moe.BailingMoeModel",
|
||||||
|
"AutoModelForCausalLM": "modeling_bailing_moe.BailingMoeForCausalLM"
|
||||||
|
},
|
||||||
|
"eos_token_id": 126081,
|
||||||
|
"pad_token_id": 126081,
|
||||||
|
"first_k_dense_replace": 0,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 2048,
|
||||||
|
"initializer_range": 0.006,
|
||||||
|
"intermediate_size": 1408,
|
||||||
|
"max_position_embeddings": 32768,
|
||||||
|
"model_type": "bailing_moe",
|
||||||
|
"moe_intermediate_size": 1408,
|
||||||
|
"num_experts": 64,
|
||||||
|
"num_shared_experts": 2,
|
||||||
|
"norm_topk_prob": true,
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_experts_per_tok": 6,
|
||||||
|
"num_hidden_layers": 28,
|
||||||
|
"num_key_value_heads": 4,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-06,
|
||||||
|
"rope_scaling": null,
|
||||||
|
"rope_theta": 600000,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.40.0",
|
||||||
|
"use_cache": true,
|
||||||
|
"use_bias": false,
|
||||||
|
"use_qkv_bias": false,
|
||||||
|
"vocab_size": 126464,
|
||||||
|
"output_router_logits": false,
|
||||||
|
"embedding_dropout": 0.0,
|
||||||
|
"norm_head": false,
|
||||||
|
"norm_softmax": false,
|
||||||
|
"output_dropout": 0.0
|
||||||
|
}
|
||||||
1
configuration.json
Normal file
1
configuration.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"framework":"Pytorch","task":"text-generation"}
|
||||||
78
configuration_bailing_moe.py
Normal file
78
configuration_bailing_moe.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
""" Bailing MoE model configuration """
|
||||||
|
|
||||||
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
|
|
||||||
|
|
||||||
|
class BailingMoeConfig(PretrainedConfig):
    """Hyperparameter container for the Bailing MoE model.

    Every constructor argument except ``pad_token_id``, ``tie_word_embeddings``
    and ``**kwargs`` is stored as an instance attribute of the same name; those
    three are forwarded to ``PretrainedConfig.__init__``.

    ``head_dim`` falls back to ``hidden_size // num_attention_heads`` whenever
    the supplied value is falsy (e.g. ``None`` or ``0``).
    """

    model_type = "bailing_moe"

    def __init__(
        self,
        vocab_size=30592,
        hidden_size=1024,
        intermediate_size=None,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=0,
        hidden_act="silu",
        use_qkv_bias=False,  # bailing only
        use_bias=True,  # bailing only
        rms_norm_eps=1e-05,
        norm_head=False,  # bailing only
        tie_word_embeddings=False,  # PretrainedConfig key, here change default value.
        embedding_dropout=0.1,
        attention_dropout=0.1,
        output_dropout=0.1,
        initializer_range=0.02,
        max_position_embeddings=16384,
        rope_theta=10000.0,
        use_cache=True,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        rope_scaling=None,
        pad_token_id=126081,
        num_experts=16,
        num_shared_experts=0,
        num_experts_per_tok=2,
        norm_topk_prob=True,
        moe_intermediate_size=None,
        first_k_dense_replace=0,
        head_dim=None,
        output_router_logits=False,
        **kwargs,
    ):
        # Settings stored verbatim; the tuple order is the attribute creation order.
        core_settings = (
            ("num_hidden_layers", num_hidden_layers),
            ("vocab_size", vocab_size),
            ("hidden_size", hidden_size),
            ("intermediate_size", intermediate_size),
            ("num_attention_heads", num_attention_heads),
            ("num_key_value_heads", num_key_value_heads),
            ("hidden_act", hidden_act),
            ("use_qkv_bias", use_qkv_bias),
            ("use_bias", use_bias),
            ("norm_head", norm_head),
            ("rms_norm_eps", rms_norm_eps),
            ("embedding_dropout", embedding_dropout),
            ("attention_dropout", attention_dropout),
            ("output_dropout", output_dropout),
            ("initializer_range", initializer_range),
            ("max_position_embeddings", max_position_embeddings),
            ("rope_theta", rope_theta),
            ("use_cache", use_cache),
            ("use_sliding_window", use_sliding_window),
            ("sliding_window", sliding_window),
            ("max_window_layers", max_window_layers),
        )
        for attr_name, attr_value in core_settings:
            setattr(self, attr_name, attr_value)

        # Use the explicit head_dim when it is truthy; otherwise derive it from
        # the hidden size and head count that were just stored above.
        if head_dim:
            self.head_dim = head_dim
        else:
            self.head_dim = self.hidden_size // self.num_attention_heads
        self.rope_scaling = rope_scaling

        # MoE configs
        moe_settings = (
            ("num_experts", num_experts),
            ("num_shared_experts", num_shared_experts),
            ("num_experts_per_tok", num_experts_per_tok),
            ("norm_topk_prob", norm_topk_prob),
            ("moe_intermediate_size", moe_intermediate_size),
            ("first_k_dense_replace", first_k_dense_replace),
            ("output_router_logits", output_router_logits),
        )
        for attr_name, attr_value in moe_settings:
            setattr(self, attr_name, attr_value)

        # The base class receives the two PretrainedConfig keys plus any extra kwargs.
        super().__init__(pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||||
6
generation_config.json
Normal file
6
generation_config.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"eos_token_id": 126081,
|
||||||
|
"pad_token_id": 126081,
|
||||||
|
"transformers_version": "4.40.0"
|
||||||
|
}
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:b83afd0fe022e3552a74da807869c865b734c88523da27a4510a32f085d9ec33
|
||||||
|
size 10000012352
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d5ccecf15d89900ba376bff70aac7266a90320205b549337883e0fbd0e7386a6
|
||||||
|
size 9997403496
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:b72c9cb181ce3ce4ba02afa6d78b8ef82f85c8b7da85b7cd472aa58517ef937c
|
||||||
|
size 9995576736
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:f5b3384dc88a60dbc5a20a7fe6617163374b1adbdd5a07c6669192e185a34abc
|
||||||
|
size 3611653272
|
||||||
5610
model.safetensors.index.json
Normal file
5610
model.safetensors.index.json
Normal file
File diff suppressed because it is too large
Load Diff
1443
modeling_bailing_moe.py
Normal file
1443
modeling_bailing_moe.py
Normal file
File diff suppressed because it is too large
Load Diff
38
special_tokens_map.json
Normal file
38
special_tokens_map.json
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
{
|
||||||
|
"additional_special_tokens": [
|
||||||
|
"<role>",
|
||||||
|
"</role>",
|
||||||
|
"<|arithmetic_start|>",
|
||||||
|
"<|arithmetic_end|>",
|
||||||
|
"<|number_start|>",
|
||||||
|
"<|number_end|>"
|
||||||
|
],
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|startoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"cls_token": {
|
||||||
|
"content": "[CLS]",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<|endoftext|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
1068
tokenization_bailing.py
Normal file
1068
tokenization_bailing.py
Normal file
File diff suppressed because it is too large
Load Diff
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:0e35981f02e539be62d3fa40b489a2cd13c4869301e6f419a5ff7167c2d4a056
|
||||||
|
size 6098787
|
||||||
2155
tokenizer_config.json
Normal file
2155
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user