Update README.md
This commit is contained in:
39
.gitattributes
vendored
39
.gitattributes
vendored
@@ -1,47 +1,44 @@
|
|||||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
*.model filter=lfs diff=lfs merge=lfs -text
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.db* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.ark* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
|
||||||
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.gguf* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.ggml filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.llamafile* filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.pt2 filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
|
||||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Comparison.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Llama-3.1-Tulu-3.1-8B-core-evals-overtime.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Llama-3.1-Tulu-3.1-8B-learning-curve.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Llama-3.1-Tulu-3.1-8B-other-evals-overtime.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|||||||
3
Comparison.png
Normal file
3
Comparison.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d1df7ec1dcc615cff1782636ca5bed4d68ae77800c425c8dd7b3906f8a5faa75
|
||||||
|
size 132298
|
||||||
3
Llama-3.1-Tulu-3.1-8B-core-evals-overtime.png
Normal file
3
Llama-3.1-Tulu-3.1-8B-core-evals-overtime.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:e317a714799e2d0c18918cb5663d902e76199a8b11ca1412ffc4108dfc0debd3
|
||||||
|
size 176576
|
||||||
3
Llama-3.1-Tulu-3.1-8B-learning-curve.png
Normal file
3
Llama-3.1-Tulu-3.1-8B-learning-curve.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:41ef1ab1da57ecf7d8619b2e635ebc75a883b8c47e64417899f19e2acd2977b8
|
||||||
|
size 316296
|
||||||
3
Llama-3.1-Tulu-3.1-8B-other-evals-overtime.png
Normal file
3
Llama-3.1-Tulu-3.1-8B-other-evals-overtime.png
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:126c2a8c2440f59f051225047fc342e5caa53757b50a3f5e3c55e7e87f8f1be2
|
||||||
|
size 513593
|
||||||
353
README.md
353
README.md
@@ -1,47 +1,318 @@
|
|||||||
---
|
---
|
||||||
license: Apache License 2.0
|
license: llama3.1
|
||||||
|
language:
|
||||||
#model-type:
|
- en
|
||||||
##如 gpt、phi、llama、chatglm、baichuan 等
|
pipeline_tag: text-generation
|
||||||
#- gpt
|
datasets:
|
||||||
|
- allenai/RLVR-GSM-MATH-IF-Mixed-Constraints
|
||||||
#domain:
|
base_model:
|
||||||
##如 nlp、cv、audio、multi-modal
|
- allenai/Llama-3.1-Tulu-3-8B-DPO
|
||||||
#- nlp
|
library_name: transformers
|
||||||
|
|
||||||
#language:
|
|
||||||
##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
|
|
||||||
#- cn
|
|
||||||
|
|
||||||
#metrics:
|
|
||||||
##如 CIDEr、Blue、ROUGE 等
|
|
||||||
#- CIDEr
|
|
||||||
|
|
||||||
#tags:
|
|
||||||
##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
|
|
||||||
#- pretrained
|
|
||||||
|
|
||||||
#tools:
|
|
||||||
##如 vllm、fastchat、llamacpp、AdaSeq 等
|
|
||||||
#- vllm
|
|
||||||
---
|
---
|
||||||
### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。
|
|
||||||
#### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型
|
|
||||||
|
|
||||||
SDK下载
|
<img src="https://huggingface.co/datasets/allenai/blog-images/resolve/main/tulu3/Tulu3-logo.png" alt="Tulu 3 banner" width="800" style="margin-left:'auto' margin-right:'auto' display:'block'"/>
|
||||||
|
|
||||||
|
# Llama-3.1-Tulu-3.1-8B
|
||||||
|
|
||||||
|
Tülu 3 is a leading instruction following model family, offering a post-training package with fully open-source data, code, and recipes designed to serve as a comprehensive guide for modern techniques.
|
||||||
|
This is one step of a bigger process to training fully open-source models, like our [OLMo](https://allenai.org/olmo) models.
|
||||||
|
Tülu 3 is designed for state-of-the-art performance on a diversity of tasks in addition to chat, such as MATH, GSM8K, and IFEval.
|
||||||
|
|
||||||
|
**Version 3.1 update**: The new version of our Tülu model is from an improvement only in the final RL stage of training.
|
||||||
|
We switched from PPO to GRPO (no reward model) and did further hyperparameter tuning to achieve substantial performance improvements across the board over the original Tülu 3 8B model,
|
||||||
|
as shown in the comparison below:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## Model description
|
||||||
|
|
||||||
|
- **Model type:** A model trained on a mix of publicly available, synthetic and human-created datasets.
|
||||||
|
- **Language(s) (NLP):** Primarily English
|
||||||
|
- **License:** Llama 3.1 Community License Agreement
|
||||||
|
- **Finetuned from model:** allenai/Llama-3.1-Tulu-3-8B-DPO
|
||||||
|
|
||||||
|
### Model Sources
|
||||||
|
|
||||||
|
- **Training Repository:** https://github.com/allenai/open-instruct
|
||||||
|
- **Eval Repository:** https://github.com/allenai/olmes
|
||||||
|
- **Paper:** https://arxiv.org/abs/2411.15124
|
||||||
|
- **Demo:** https://playground.allenai.org/
|
||||||
|
|
||||||
|
### Model Family
|
||||||
|
|
||||||
|
| **Stage** | **Llama 3.1 8B (New)** | **Llama 3.1 70B** |
|
||||||
|
|----------------------|----------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------|
|
||||||
|
| **Base Model** | [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) | [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B) |
|
||||||
|
| **SFT** | [allenai/Llama-3.1-Tulu-3-8B-SFT](https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-SFT) | [allenai/Llama-3.1-Tulu-3-70B-SFT](https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B-SFT) |
|
||||||
|
| **DPO** | [allenai/Llama-3.1-Tulu-3-8B-DPO](https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-DPO) | [allenai/Llama-3.1-Tulu-3-70B-DPO](https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B-DPO) |
|
||||||
|
| **Final Models (RLVR)** | [allenai/Llama-3.1-Tulu-3.1-8B](https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B) | [allenai/Llama-3.1-Tulu-3-70B](https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B) |
|
||||||
|
| **Reward Model (RM)**| None with GRPO | [allenai/Llama-3.1-Tulu-3-8B-RM](https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-RM) |
|
||||||
|
|
||||||
|
| **Stage** | **Llama 3.1 405B** |
|
||||||
|
|-----------|-------------------|
|
||||||
|
| **Base Model** | [meta-llama/llama-3.1-405B](https://huggingface.co/meta-llama/llama-3.1-405B) |
|
||||||
|
| **SFT** | [allenai/llama-3.1-Tulu-3-405B-SFT](https://huggingface.co/allenai/llama-3.1-Tulu-3-405B-SFT) |
|
||||||
|
| **DPO** | [allenai/llama-3.1-Tulu-3-405B-DPO](https://huggingface.co/allenai/llama-3.1-Tulu-3-405B-DPO) |
|
||||||
|
| **Final Model (RLVR)** | [allenai/llama-3.1-Tulu-3-405B](https://huggingface.co/allenai/llama-3.1-Tulu-3-405B) |
|
||||||
|
| **Reward Model (RM)**| (Same as 70B)
|
||||||
|
|
||||||
|
|
||||||
|
## Using the model
|
||||||
|
|
||||||
|
### Loading with HuggingFace
|
||||||
|
|
||||||
|
To load the model with HuggingFace, use the following snippet:
|
||||||
|
```
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
|
||||||
|
tulu_model = AutoModelForCausalLM.from_pretrained("allenai/Llama-3.1-Tulu-3.1-8B")
|
||||||
|
```
|
||||||
|
|
||||||
|
### VLLM
|
||||||
|
|
||||||
|
As a Llama base model, the model can be easily served with:
|
||||||
|
```
|
||||||
|
vllm serve allenai/Llama-3.1-Tulu-3.1-8B
|
||||||
|
```
|
||||||
|
Note that given the long chat template of Llama, you may want to use `--max_model_len=8192`.
|
||||||
|
|
||||||
|
### Chat template
|
||||||
|
|
||||||
|
The chat template for our models is formatted as:
|
||||||
|
```
|
||||||
|
<|user|>\nHow are you doing?\n<|assistant|>\nI'm just a computer program, so I don't have feelings, but I'm functioning as expected. How can I assist you today?<|endoftext|>
|
||||||
|
```
|
||||||
|
Or with new lines expanded:
|
||||||
|
```
|
||||||
|
<|user|>
|
||||||
|
How are you doing?
|
||||||
|
<|assistant|>
|
||||||
|
I'm just a computer program, so I don't have feelings, but I'm functioning as expected. How can I assist you today?<|endoftext|>
|
||||||
|
```
|
||||||
|
It is embedded within the tokenizer as well, for `tokenizer.apply_chat_template`.
|
||||||
|
|
||||||
|
### System prompt
|
||||||
|
|
||||||
|
In Ai2 demos, we use this system prompt by default:
|
||||||
|
```
|
||||||
|
You are Tulu 3, a helpful and harmless AI Assistant built by the Allen Institute for AI.
|
||||||
|
```
|
||||||
|
The model has not been trained with a specific system prompt in mind.
|
||||||
|
|
||||||
|
### Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
The Tülu3 models have limited safety training, but are not deployed automatically with in-the-loop filtering of responses like ChatGPT, so the model can produce problematic outputs (especially when prompted to do so).
|
||||||
|
It is also unknown what the size and composition of the corpus was used to train the base Llama 3.1 models, however it is likely to have included a mix of Web data and technical sources like books and code.
|
||||||
|
See the Falcon 180B model card for an example of this.
|
||||||
|
|
||||||
|
|
||||||
|
## Performance
|
||||||
|
|
||||||
|
| Benchmark (eval) | Tülu 3 SFT 8B | Tülu 3 DPO 8B | Tülu 3 8B | **Tülu 3.1 8B (NEW)** | Llama 3.1 8B Instruct | Qwen 2.5 7B Instruct | Magpie 8B | Gemma 2 9B Instruct | Ministral 8B Instruct |
|
||||||
|
|---------------------------------|--------------|--------------|-----------|------------|------------------------|----------------------|-----------|---------------------|-----------------------|
|
||||||
|
| **Avg.** | 60.4 | 64.4 | 64.8 | 66.3 | 62.2 | **66.5** | 44.7 | 55.2 | 58.3 |
|
||||||
|
| **MMLU (0 shot, CoT)** | 65.9 | 68.7 | 68.2 | 69.5 | 71.2 | **76.6** | 62.0 | 74.6 | 68.5 |
|
||||||
|
| **PopQA (15 shot)** | **29.3** | 29.3 | 29.1 | 30.2 | 20.2 | 18.1 | 22.5 | 28.3 | 20.2 |
|
||||||
|
| **TruthfulQA (6 shot)** | 46.8 | 56.1 | 55.0 | 59.9 | 55.1 | **63.1** | 57.0 | 61.4 | 55.5 |
|
||||||
|
| **BigBenchHard (3 shot, CoT)** | **67.9** | 65.8 | 66.0 | 68.9 | 62.8 | 70.2 | 0.9 | 2.5 | 56.2 |
|
||||||
|
| **DROP (3 shot)** | 61.3 | 62.5 | 62.6 | **63.9** | 61.5 | 54.4 | 49.4 | 58.8 | 56.2 |
|
||||||
|
| **MATH (4 shot CoT, Flex)** | 31.5 | 42.0 |43.7 | 47.8 | 42.5 | **69.9** | 5.1 | 29.8 | 40.0 |
|
||||||
|
| **GSM8K (8 shot, CoT)** | 76.2 | 84.3 | 87.6 | **90.0** | 83.4 | 83.8 | 61.2 | 79.7 | 80.0 |
|
||||||
|
| **HumanEval (pass@10)** | 86.2 | 83.9 | 83.9 | 84.8 | 86.3 | **93.1** | 75.4 | 71.7 | 91.0 |
|
||||||
|
| **HumanEval+ (pass@10)** | 81.4 | 78.6 | 79.2 | 80.4 | 82.9 | **89.7** | 69.1 | 67.0 | 88.5 |
|
||||||
|
| **IFEval (prompt loose)** | 72.8 | 81.1 | 82.4 | **83.9** | 80.6 | 74.7 | 38.8 | 69.9 | 56.4 |
|
||||||
|
| **AlpacaEval 2 (LC % win)** | 12.4 | 33.5 | 34.5 | 34.9 | 24.2 | 29.0 | **49.0** | 43.7 | 31.4 |
|
||||||
|
| **Safety (6 task avg.)** | **93.1** | 87.2 | 85.5 | 81.2 | 75.2 | 75.0 | 46.4 | 75.5 | 56.2 |
|
||||||
|
|
||||||
|
|
||||||
|
*Note, see the updated version of the paper for the latest, fixed evaluations that improve scores for models such as Qwen 2.5 Instruct.*
|
||||||
|
|
||||||
|
| Benchmark (eval) | Tülu 3 70B SFT | Tülu 3 DPO 70B | Tülu 3 70B | Llama 3.1 70B Instruct | Qwen 2.5 72B Instruct | Hermes 3 Llama 3.1 70B | Nemotron Llama 3.1 70B |
|
||||||
|
|---------------------------------|-----------------|-----------------|-------------|-------------------------|-----------------------|------------------------|-------------------------|
|
||||||
|
| **Avg.** | 72.6 | 75.9 | **76.0** | 73.4 | 71.5 | 68.3 | 65.5 |
|
||||||
|
| **MMLU (0 shot, CoT)** | 78.9 | 83.3 | 83.1 | 85.3 | **85.5** | 80.4 | 83.8 |
|
||||||
|
| **PopQA (15 shot)** | **48.6** | 46.3 | 46.5 | 46.4 | 30.6 | 48.1 | 36.4 |
|
||||||
|
| **TruthfulQA (6 shot)** | 55.7 | 67.9 | 67.6 | 66.8 | **69.9** | 66.5 | 62.6 |
|
||||||
|
| **BigBenchHard (3 shot, CoT)** | **82.7** | 81.8 | 82.0 | 73.8 | 67.2 | 82.1 | 0.7 |
|
||||||
|
| **DROP (3 shot)** | **77.2** | 74.1 | 74.3 | 77.0 | 34.2 | 73.2 | 68.8 |
|
||||||
|
| **MATH (4 shot CoT, Flex)** | 53.7 | 62.3 | 63.0 | 56.4 | **74.3** | 41.9 | 55.0 |
|
||||||
|
| **GSM8K (8 shot, CoT)** | 91.1 | 93.5 | 93.5 | **93.7** | 89.5 | 90.0 | 84.7 |
|
||||||
|
| **HumanEval (pass@10)** | 92.9 | 92.4 | 92.4 | 93.6 | 94.0 | 89.6 | **94.1** |
|
||||||
|
| **HumanEval+ (pass@10)** | 87.3 | 88.4 | 88.0 | 89.5 | **90.8** | 85.9 | 85.5 |
|
||||||
|
| **IFEval (prompt loose)** | 82.1 | 82.6 | 83.2 | **88.0** | 87.6 | 76.0 | 79.9 |
|
||||||
|
| **AlpacaEval 2 (LC % win)** | 26.3 | 49.6 | 49.8 | 33.4 | 47.7 | 28.4 | **66.1** |
|
||||||
|
| **Safety (6 task avg.)** | **94.4** | 89.0 | 88.3 | 76.5 | 87.0 | 57.9 | 69.0 |
|
||||||
|
|
||||||
|
| Benchmark (eval) | Tülu 3 405B SFT | Tülu 3 405B DPO | Tülu 3 405B | Llama 3.1 405B Instruct | Nous Hermes 3 405B | Deepseek V3 | GPT 4o (11-24) |
|
||||||
|
|-----------------|----------------|----------------|-------------|------------------------|-------------------|-------------|----------------|
|
||||||
|
| **Avg w/o Safety** | 76.3 | 79.0 | 80.0 | 78.1 | 74.4 | 79.0 | **80.5** |
|
||||||
|
| **Avg w/ Safety** | 77.5 | 79.6 | 80.7 | 79.0 | 73.5 | 75.9 | **81.6** |
|
||||||
|
| **MMLU (5 shot, CoT)** | 84.4 | 86.6 | 87.0 | **88.0** | 84.9 | 82.1 | 87.9 |
|
||||||
|
| **PopQA (3 shot)** | **55.7** | 55.4 | 55.5 | 52.9 | 54.2 | 44.9 | 53.6 |
|
||||||
|
| **BigBenchHard (0 shot, CoT)** | 88.0 | 88.8 | 88.6 | 87.1 | 87.7 | **89.5** | 83.3 |
|
||||||
|
| **MATH (4 shot, Flex)** | 63.4 | 59.9 | 67.3 | 66.6 | 58.4 | **72.5** | 68.8 |
|
||||||
|
| **GSM8K (8 shot, CoT)** | 93.6 | 94.2 | **95.5** | 95.4 | 92.7 | 94.1 | 91.7 |
|
||||||
|
| **HumanEval (pass@10)** | 95.7 | **97.2** | 95.9 | 95.9 | 92.3 | 94.6 | 97.0 |
|
||||||
|
| **HumanEval+ (pass@10)** | 93.3 | **93.9** | 92.9 | 90.3 | 86.9 | 91.6 | 92.7 |
|
||||||
|
| **IFEval (prompt loose)** | 82.4 | 85.0 | 86.0 | **88.4** | 81.9 | 88.0 | 84.8 |
|
||||||
|
| **AlpacaEval 2 (LC % win)** | 30.4 | 49.8 | 51.4 | 38.5 | 30.2 | 53.5 | **65.0** |
|
||||||
|
| **Safety (6 task avg.)** | 87.7 | 85.5 | 86.7 | 86.8 | 65.8 | 72.2 | **90.9** |
|
||||||
|
|
||||||
|
|
||||||
|
## Hyperparamters
|
||||||
|
|
||||||
|
GRPO settings for RLVR:
|
||||||
|
- **Learning Rate**: 5 × 10⁻⁷
|
||||||
|
- **Discount Factor (gamma)**: 1.0
|
||||||
|
- **Mini-batches (N_mb)**: 2
|
||||||
|
- **PPO-style Update Iteration (K)**: 1
|
||||||
|
- **Clipping Coefficient (epsilon)**: 0.2
|
||||||
|
- **Gradient Norm Threshold**: 1.0
|
||||||
|
- **Learning Rate Schedule**: Constant
|
||||||
|
- **Generation Temperature**: 1.0
|
||||||
|
- **Number of Samples per Prompt**: 16
|
||||||
|
- **Number of Unique Prompts per Training Iteration**: 48
|
||||||
|
- **Batch Size (effective)**: 48 * 16 = 768
|
||||||
|
- **Max Token Length**: 2,048
|
||||||
|
- **Max Prompt Token Length**: 2,048
|
||||||
|
- **Penalty Reward Value for Responses without an EOS Token**: 0.0
|
||||||
|
- **Response Length**: 2,048
|
||||||
|
- **Total Episodes**: 10,000,000 (the actual checkpoint is at episode 1474560)
|
||||||
|
- **KL penalty coefficient (beta)**: 0.01
|
||||||
|
- **Warm up ratio (omega)**: 0.0
|
||||||
|
|
||||||
|
|
||||||
|
## Learning curves
|
||||||
|
|
||||||
|
Below is the training curves for Llama-3.1-Tulu-3.1-8B:
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Below are the core eval scores over steps for Llama-3.1-Tulu-3.1-8B (note we took step `1920` as the final checkpoint, corresponding to episode `1,474,560`):
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Below are the other eval scores over steps for Llama-3.1-Tulu-3.1-8B (the codex evals had a bug and earlier scores are not shown):
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## Reproduction command
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
#安装ModelScope
|
# clone and check out commit
|
||||||
pip install modelscope
|
git clone https://github.com/allenai/open-instruct.git
|
||||||
```
|
git checkout 3f37c29ddc97d2c108a7658692d2d2c3708ef182
|
||||||
```python
|
|
||||||
#SDK模型下载
|
# run my exact command for launching exps
|
||||||
from modelscope import snapshot_download
|
for learning_rate in 5e-7; do
|
||||||
model_dir = snapshot_download('allenai/Llama-3.1-Tulu-3.1-8B')
|
for beta in 0.01; do
|
||||||
```
|
for nspp in 16; do
|
||||||
Git下载
|
for m in half-m ; do
|
||||||
```
|
for kl_estimator in kl3; do
|
||||||
#Git模型下载
|
local_rollout_batch_size=8
|
||||||
git clone https://www.modelscope.cn/allenai/Llama-3.1-Tulu-3.1-8B.git
|
# `half-m` is the same as setting number of mini-batches to be 2.
|
||||||
|
if [ $m == "half-m" ]; then
|
||||||
|
local_mini_batch_size=$(($local_rollout_batch_size * $nspp / 2))
|
||||||
|
else
|
||||||
|
local_mini_batch_size=$(($local_rollout_batch_size * $nspp))
|
||||||
|
fi
|
||||||
|
exp_name="0204_lr_scan_grpo_math_lr_${learning_rate}_${kl_estimator}_${beta}_${nspp}_${m}_${RANDOM}"
|
||||||
|
echo $exp_name:
|
||||||
|
echo --- local_mini_batch_size=$local_mini_batch_size
|
||||||
|
echo --- num_gradient_updates=$(($local_rollout_batch_size * $nspp / $local_mini_batch_size))
|
||||||
|
python open_instruct/grpo_vllm_thread_ray_gtrl.py \
|
||||||
|
--exp_name $exp_name \
|
||||||
|
--beta $beta \
|
||||||
|
--local_mini_batch_size $local_mini_batch_size \
|
||||||
|
--number_samples_per_prompt $nspp \
|
||||||
|
--output_dir output/$exp_name \
|
||||||
|
--local_rollout_batch_size $local_rollout_batch_size \
|
||||||
|
--kl_estimator $kl_estimator \
|
||||||
|
--learning_rate $learning_rate \
|
||||||
|
--dataset_mixer_list allenai/RLVR-GSM-MATH-IF-Mixed-Constraints 1.0 \
|
||||||
|
--dataset_mixer_list_splits train \
|
||||||
|
--dataset_mixer_eval_list allenai/RLVR-GSM-MATH-IF-Mixed-Constraints 16 \
|
||||||
|
--dataset_mixer_eval_list_splits train \
|
||||||
|
--max_token_length 2048 \
|
||||||
|
--max_prompt_token_length 2048 \
|
||||||
|
--response_length 2048 \
|
||||||
|
--model_name_or_path allenai/Llama-3.1-Tulu-3-8B-DPO \
|
||||||
|
--non_stop_penalty \
|
||||||
|
--stop_token eos \
|
||||||
|
--temperature 1.0 \
|
||||||
|
--ground_truths_key ground_truth \
|
||||||
|
--chat_template_name tulu \
|
||||||
|
--sft_messages_key messages \
|
||||||
|
--total_episodes 10000000 \
|
||||||
|
--penalty_reward_value 0.0 \
|
||||||
|
--deepspeed_stage 2 \
|
||||||
|
--per_device_train_batch_size 2 \
|
||||||
|
--local_rollout_forward_batch_size 2 \
|
||||||
|
--actor_num_gpus_per_node 6 \
|
||||||
|
--num_epochs 1 \
|
||||||
|
--vllm_tensor_parallel_size 2 \
|
||||||
|
--lr_scheduler_type constant \
|
||||||
|
--apply_verifiable_reward true \
|
||||||
|
--seed 1 \
|
||||||
|
--num_evals 30 \
|
||||||
|
--save_freq 40 \
|
||||||
|
--reward_model_multiplier 0.0 \
|
||||||
|
--gradient_checkpointing \
|
||||||
|
--with_tracking
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
|
done
|
||||||
```
|
```
|
||||||
|
|
||||||
<p style="color: lightgrey;">如果您是本模型的贡献者,我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>,及时完善模型卡片内容。</p>
|
## License and use
|
||||||
|
|
||||||
|
All Llama 3.1 Tülu3 models are released under Meta's [Llama 3.1 Community License Agreement](https://www.llama.com/llama3_1/license/).
|
||||||
|
Llama 3.1 is licensed under the Llama 3.1 Community License, Copyright © Meta Platforms, Inc.
|
||||||
|
Tülu3 is intended for research and educational use.
|
||||||
|
For more information, please see our [Responsible Use Guidelines](https://allenai.org/responsible-use).
|
||||||
|
|
||||||
|
The models have been fine-tuned using a dataset mix with outputs generated from third party models and are subject to additional terms:
|
||||||
|
[Gemma Terms of Use](https://ai.google.dev/gemma/terms) and [Qwen License Agreement](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/LICENSE) (models were improved using Qwen 2.5).
|
||||||
|
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If Tülu3 or any of the related materials were helpful to your work, please cite:
|
||||||
|
```
|
||||||
|
@article{lambert2024tulu3,
|
||||||
|
title = {Tülu 3: Pushing Frontiers in Open Language Model Post-Training},
|
||||||
|
author = {
|
||||||
|
Nathan Lambert and
|
||||||
|
Jacob Morrison and
|
||||||
|
Valentina Pyatkin and
|
||||||
|
Shengyi Huang and
|
||||||
|
Hamish Ivison and
|
||||||
|
Faeze Brahman and
|
||||||
|
Lester James V. Miranda and
|
||||||
|
Alisa Liu and
|
||||||
|
Nouha Dziri and
|
||||||
|
Shane Lyu and
|
||||||
|
Yuling Gu and
|
||||||
|
Saumya Malik and
|
||||||
|
Victoria Graf and
|
||||||
|
Jena D. Hwang and
|
||||||
|
Jiangjiang Yang and
|
||||||
|
Ronan Le Bras and
|
||||||
|
Oyvind Tafjord and
|
||||||
|
Chris Wilhelm and
|
||||||
|
Luca Soldaini and
|
||||||
|
Noah A. Smith and
|
||||||
|
Yizhong Wang and
|
||||||
|
Pradeep Dasigi and
|
||||||
|
Hannaneh Hajishirzi
|
||||||
|
},
|
||||||
|
year = {2024},
|
||||||
|
email = {tulu@allenai.org}
|
||||||
|
}
|
||||||
|
```
|
||||||
36
config.json
Normal file
36
config.json
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
"_name_or_path": "allenai/Llama-3.1-Tulu-3-8B-DPO",
|
||||||
|
"architectures": [
|
||||||
|
"LlamaForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"eos_token_id": 128001,
|
||||||
|
"head_dim": 128,
|
||||||
|
"hidden_act": "silu",
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 14336,
|
||||||
|
"max_position_embeddings": 131072,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "llama",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pretraining_tp": 1,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": {
|
||||||
|
"factor": 8.0,
|
||||||
|
"high_freq_factor": 4.0,
|
||||||
|
"low_freq_factor": 1.0,
|
||||||
|
"original_max_position_embeddings": 8192,
|
||||||
|
"rope_type": "llama3"
|
||||||
|
},
|
||||||
|
"rope_theta": 500000.0,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"torch_dtype": "bfloat16",
|
||||||
|
"transformers_version": "4.47.1",
|
||||||
|
"use_cache": false,
|
||||||
|
"vocab_size": 128264
|
||||||
|
}
|
||||||
1
configuration.json
Normal file
1
configuration.json
Normal file
@@ -0,0 +1 @@
|
|||||||
|
{"framework": "pytorch", "task": "text-generation", "allow_remote": true}
|
||||||
9
generation_config.json
Normal file
9
generation_config.json
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 128000,
|
||||||
|
"do_sample": true,
|
||||||
|
"eos_token_id": 128001,
|
||||||
|
"temperature": 0.6,
|
||||||
|
"top_p": 0.9,
|
||||||
|
"transformers_version": "4.47.1"
|
||||||
|
}
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:d33a8d4b121c60fd8120b01dc5a5e64161a558aaa0d816fec9c6ef784f1f71ae
|
||||||
|
size 4976764208
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a2eadd476720bdef2c8ee4068e78f3772330429b93bfed42158ff543b92dfcf0
|
||||||
|
size 4999802720
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:84e4d95f33f4a177652d71260c75dbfe179c9920c3f00e512ee7b0e1e370c9ea
|
||||||
|
size 4915916176
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:a775c18784fae2b3961cf0395b6c34ee4c2dfe7ccb1f93251ef66642d0f73602
|
||||||
|
size 1168204344
|
||||||
298
model.safetensors.index.json
Normal file
298
model.safetensors.index.json
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_size": 16060653568
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.norm.weight": "model-00004-of-00004.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
23
special_tokens_map.json
Normal file
23
special_tokens_map.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<|begin_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|end_of_text|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:9400df98529060210393c40f08cb127f7c0df584338b3fbfdba8cf82a33c1ade
|
||||||
|
size 17210102
|
||||||
2072
tokenizer_config.json
Normal file
2072
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user