初始化项目,由ModelHub XC社区提供模型
Model: tartuNLP/Apertus-EstLLM-8B-Instruct-0326 Source: Original Platform
This commit is contained in:
36
.gitattributes
vendored
Normal file
36
.gitattributes
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.model filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||||
|
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||||
|
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||||
|
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
||||||
253
README.md
Normal file
253
README.md
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
---
|
||||||
|
library_name: transformers
|
||||||
|
language:
|
||||||
|
- et
|
||||||
|
- en
|
||||||
|
base_model:
|
||||||
|
- tartuNLP/Apertus-EstLLM-8B-Instruct-1125
|
||||||
|
- swiss-ai/Apertus-8B-Instruct-2509
|
||||||
|
tags:
|
||||||
|
- merge
|
||||||
|
license: apache-2.0
|
||||||
|
---
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
# Apertus EstLLM 8B 0326 Instruct
|
||||||
|
|
||||||
|
`Apertus-EstLLM-8B-Instruct-0326` is obtained by applying the chat-vector merge approach
|
||||||
|
to [tartuNLP/Apertus-EstLLM-8B-Instruct-1125](https://huggingface.co/tartuNLP/Apertus-EstLLM-8B-Instruct-1125).
|
||||||
|
|
||||||
|
## Use with transformers
|
||||||
|
|
||||||
|
```python
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
import torch
|
||||||
|
|
||||||
|
model_name = "tartuNLP/Apertus-EstLLM-8B-Instruct-0326"
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
model_name,
|
||||||
|
dtype="auto",
|
||||||
|
device_map="auto"
|
||||||
|
)
|
||||||
|
|
||||||
|
# to use on apple silicon, load the following way
|
||||||
|
# model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
# model_name,
|
||||||
|
# dtype=torch.float16,
|
||||||
|
# device_map="mps",
|
||||||
|
# )
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{"role": "user", "content": "Kas sa räägid eesti keelt?"}
|
||||||
|
]
|
||||||
|
|
||||||
|
text = tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=True
|
||||||
|
)
|
||||||
|
|
||||||
|
model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
||||||
|
|
||||||
|
generated_ids = model.generate(
|
||||||
|
**model_inputs,
|
||||||
|
max_new_tokens=128,
|
||||||
|
do_sample=True,
|
||||||
|
temperature=0.4,
|
||||||
|
# specify eos token to stop at the end of the assistant response
|
||||||
|
eos_token_id=tokenizer.eos_token_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# generated_ids include the input tokens as well, so we only decode new tokens
|
||||||
|
response = tokenizer.decode(
|
||||||
|
generated_ids[0][model_inputs["input_ids"].shape[1]:],
|
||||||
|
skip_special_tokens=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(response)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
## Logits-based
|
||||||
|
|
||||||
|
Scores for logits-based evaluation benchmarks are available on the [EuroEval](https://euroeval.com/leaderboards/Monolingual/estonian/) leaderboard.
|
||||||
|
|
||||||
|
## Generative
|
||||||
|
|
||||||
|
Every benchmark in this category is treated as a *generative* problem, and thus the evaluation is performed on the model responses obtained with 0 temperature (not logits).
|
||||||
|
The top scores are highlighted with **bold**. Second best scores are highlighted with **_italic bold_**. Rows are sorted in descending order based on the number of parameters of models (not scores).
|
||||||
|
The test set is used for evaluation of each dataset unless noted otherwise.
|
||||||
|
|
||||||
|
Note that _all models are evaluated with the same prompt template_ for comparability, meaning that the scores do not necessarily represent each model's best possible
|
||||||
|
performance. This is especially the case for `deepseek-ai/DeepSeek-V3-0324` on some of the benchmarks.
|
||||||
|
|
||||||
|
Only models of comparable size are evaluated on benchmarks in English.
|
||||||
|
|
||||||
|
### Instruction-following
|
||||||
|
|
||||||
|
#### Estonian
|
||||||
|
|
||||||
|
Instruction level strict accuracy is reported for IFEval-et.
|
||||||
|
|
||||||
|
| Model (# parameters ↓) | [IFEval-et](https://huggingface.co/datasets/tartuNLP/ifeval_et) |
|
||||||
|
|-------|-----------------------------------|
|
||||||
|
| moonshotai/Kimi-K2-Instruct | **0.7891** |
|
||||||
|
| deepseek-ai/DeepSeek-V3.2 | 0.7221 |
|
||||||
|
| deepseek-ai/DeepSeek-V3-0324 | 0.7171 |
|
||||||
|
| mistralai/Mistral-Large-3-675B-Instruct-2512 | 0.7097 |
|
||||||
|
| meta-llama/Llama-3.1-405B-Instruct | 0.7159 |
|
||||||
|
| meta-llama/Llama-3.3-70B-Instruct | **_0.7705_** |
|
||||||
|
| Qwen/Qwen2.5-72B-Instruct | 0.7407 |
|
||||||
|
| google/gemma-3-27b-it | 0.7655 |
|
||||||
|
| google/gemma-3-12b-it | 0.7556 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.5571 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.5397 |
|
||||||
|
| mistralai/Ministral-3-8B-Instruct-2512 | 0.4888 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326** | 0.5608 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125 | 0.4665 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509| 0.5484 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | 0.3797 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | 0.6141 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.5174 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.5195 |
|
||||||
|
| tartuNLP/Llammas | 0.3524 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | 0.4988 |
|
||||||
|
| CohereLabs/tiny-aya-global | 0.6687 |
|
||||||
|
|
||||||
|
|
||||||
|
#### English
|
||||||
|
|
||||||
|
Instruction level strict accuracy is reported for IFEval-en.
|
||||||
|
|
||||||
|
|
||||||
|
| Model (# parameters ↓) | [IFEval-en](https://huggingface.co/datasets/tartuNLP/ifeval_en) |
|
||||||
|
|-------|-----------------------------------|
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.7564 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.7004 |
|
||||||
|
| mistralai/Ministral-3-8B-Instruct-2512 | 0.6845 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326** | 0.7089 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125 | 0.6638 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509 | 0.7808 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | _**0.8106**_ |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | **0.8173** |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.7527 |
|
||||||
|
| tartuNLP/Llammas | 0.4373 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.3289 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | 0.7954 |
|
||||||
|
|
||||||
|
### Multiple Choice
|
||||||
|
|
||||||
|
All datasets except Winogrande-et are evaluated in 0-shot mode. Winogrande-et is evaluated in 3-shot mode. Exact match accuracy is reported for every dataset.
|
||||||
|
|
||||||
|
#### Estonian Language Competence
|
||||||
|
|
||||||
|
| Model (# parameters ↓) | [Grammar-et](https://huggingface.co/datasets/TalTechNLP/grammar_et)| [Inflection-et](https://huggingface.co/datasets/TalTechNLP/inflection_et)| [Word-Meanings-et](https://huggingface.co/datasets/TalTechNLP/word_meanings_et) |
|
||||||
|
|-------|------|------|--------|
|
||||||
|
| moonshotai/Kimi-K2-Instruct | **0.916** | 0.6458 | **0.9689** |
|
||||||
|
| deepseek-ai/DeepSeek-V3.2 | 0.781 | 0.6891 | 0.8134 |
|
||||||
|
| deepseek-ai/DeepSeek-V3-0324 | 0.364 | 0 | 0 |
|
||||||
|
| mistralai/Mistral-Large-3-675B-Instruct-2512 | 0.796 | _**0.8355**_ | 0.9488 |
|
||||||
|
| meta-llama/Llama-3.1-405B-Instruct | 0.818 | **0.9089** | 0.9438 |
|
||||||
|
| meta-llama/Llama-3.3-70B-Instruct | 0.797 | 0.6421 | 0.9408 |
|
||||||
|
| Qwen/Qwen2.5-72B-Instruct | 0.694 | 0.5208 | 0.9057 |
|
||||||
|
| google/gemma-3-27b-it | 0.817 | 0.5934 | 0.9529 |
|
||||||
|
| google/gemma-3-12b-it | 0.789 | 0.4227 | 0.9318 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.644 | 0.4466 | 0.9288 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.764 | 0.367 | 0.9258 |
|
||||||
|
| mistralai/Ministral-3-8B-Instruct-2512 | 0.562 | 0.4833 | 0.8395 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326**| 0.713 | 0.4326 | 0.9438 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125| 0.646 | 0.421 | 0.9178 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509 | 0.512 | 0.3662 | 0.9027 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | 0.657 | 0.4165 | 0.8335 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | _**0.8310**_ | 0.5777 | _**0.9619**_ |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.692 | 0.5188 | 0.9569 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.594 | 0.2668 | 0.8084 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | 0.598 | 0.4136 | 0.7984 |
|
||||||
|
| tartuNLP/Llammas | 0.529 | 0.2289 | 0.5326 |
|
||||||
|
| CohereLabs/tiny-aya-global | 0.563 | 0.3221 | 0.8455 |
|
||||||
|
|
||||||
|
#### Knowledge and Reasoning (Estonian)
|
||||||
|
|
||||||
|
|
||||||
|
| Model (# parameters ↓) | [Winogrande-et](https://huggingface.co/datasets/tartuNLP/winogrande_et) | [Trivia-et](https://huggingface.co/datasets/TalTechNLP/trivia_et) | [Exam-et](https://huggingface.co/datasets/TalTechNLP/exam_et) | [GlobalPIQA-et](https://huggingface.co/datasets/mrlbenchmarks/global-piqa-nonparallel/viewer/ekk_latn)| [TruthfulQA-et](https://huggingface.co/datasets/LumiOpen/opengpt-x_truthfulqax/viewer/mc_ET) |
|
||||||
|
|-------|-----------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|-------------------------------------------|
|
||||||
|
| moonshotai/Kimi-K2-Instruct | **0.8138** | 0.4225 | **0.8414** | **0.79** | **0.7136** |
|
||||||
|
| deepseek-ai/DeepSeek-V3.2 | 0.4805 | 0.38 | 0.614 | 0.7 | 0.5863 |
|
||||||
|
| deepseek-ai/DeepSeek-V3-0324 | **_0.8042_** | 0.27 | 0.1221 | 0.04 | 0.2093 |
|
||||||
|
| mistralai/Mistral-Large-3-675B-Instruct-2512 | 0.7487 | 0.4275 | 0.7931 | _**0.73**_ | 0.6854 |
|
||||||
|
| meta-llama/Llama-3.1-405B-Instruct |0.7878 | **0.4713** | _**0.8309**_ | 0.58 | _**0.7001**_ |
|
||||||
|
| meta-llama/Llama-3.3-70B-Instruct |0.7397 | 0.3875 | 0.7652 | 0.58 | 0.6255 |
|
||||||
|
| Qwen/Qwen2.5-72B-Instruct | 0.7227 | 0.315 | 0.7162 | 0.65 | 0.6683 |
|
||||||
|
| google/gemma-3-27b-it | 0.7510 | 0.325 | 0.7751 | 0.71 | 0.5814 |
|
||||||
|
| google/gemma-3-12b-it | 0.6712 | 0.3237 | 0.7069 | 0.54 | 0.3158 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.5195 | 0.375 | 0.6097 | 0.52 | 0.399 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.5846 | 0.3738 | 0.5589 | 0.55 | 0.2889 |
|
||||||
|
| mistralai/Ministral-3-8B-Instruct-2512 | 0.5812 | 0.3125 | 0.5012 | 0.48 | 0.3525 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326** | 0.5976 | 0.35 | 0.6022 | 0.64 | 0.4296 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125 | 0.5467 | 0.3575 | 0.5651 | 0.63 | 0.3696 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509 | 0.5105 | 0.345 | 0.552 | 0.59 | 0.366 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | 0.5399 | 0.2888 | 0.5 | 0.54 | 0.437 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | 0.6440 | _**0.4288**_ | 0.6332 | 0.68 | 0.3794 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.5812 | 0.425 | 0.5093 | 0.63 | 0.3525 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.2878 | 0.2875 | 0.3556 | 0.55 | 0.3011 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | 0.5473 | 0.2938 | 0.4913 | 0.57 | 0.4113 |
|
||||||
|
| tartuNLP/Llammas | 0.5037 | 0.2838 | 0.3649 | 0.01 | 0.2032 |
|
||||||
|
| CohereLabs/tiny-aya-global | 0.5603 | 0.31 | 0.5638 | 0.52 | 0.3782 |
|
||||||
|
|
||||||
|
#### Knowledge and Reasoning (English)
|
||||||
|
|
||||||
|
|
||||||
|
| Model (# parameters ↓) | [Winogrande](https://huggingface.co/datasets/allenai/winogrande) | [GlobalPIQA-en](https://huggingface.co/datasets/mrlbenchmarks/global-piqa-nonparallel/viewer/eng_latn) | [TruthfulQA](https://huggingface.co/datasets/truthfulqa/truthful_qa) | [MMLU-Redux](https://huggingface.co/datasets/edinburgh-dawg/mmlu-redux-2.0) | [GSM8K](https://huggingface.co/datasets/openai/gsm8k) |
|
||||||
|
|-------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|-----------------------------------|
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.5546 | 0.58 |0.4614 | 0.6334 | 0.4139 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.5059 | 0.58 | 0.2962 | 0.5741 | 0.5944 |
|
||||||
|
| mistralai/Ministral-3-8B-Instruct-2512 | _**0.6503**_ | _**0.77**_ | 0.519 | _**0.7418**_ | 0.3927 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326** | 0.5699 | 0.69 | 0.4174 | 0.5946 | 0.5588 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125 | 0.5348 | 0.56 | 0.3647 | 0.5944 | 0.5277 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509 | 0.5133 | 0.73 | 0.3831 | 0.6099 | 0.5936 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | 0.5625 | 0.76 | _**0.5239**_ | 0.6959 | 0.7710 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | 0.6118 | 0.76 | 0.3635 | 0.6606 | _**0.7726**_ |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.6084 | 0.71 | 0.366 | 0.6388 | 0.7202 |
|
||||||
|
| tartuNLP/Llammas | 0.498 | 0 | 0.1971 | 0.3417 | 0.1456 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.4029 | 0.63 | 0.2717 | 0.5180 | 0.0076 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | **0.6627** | **0.83** | **0.5875** | **0.7555** | **0.7862** |
|
||||||
|
|
||||||
|
|
||||||
|
### Translation
|
||||||
|
|
||||||
|
#### English to Estonian
|
||||||
|
|
||||||
|
| Model | [wmt24pp](https://huggingface.co/datasets/google/wmt24pp) (BLEU ↑) |
|
||||||
|
|-------|---------|
|
||||||
|
| BSC-LT/salamandraTA-7b-instruct | 0.2713 |
|
||||||
|
| **tartuNLP/Apertus-EstLLM-8B-Instruct-0326** | 0.2676 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-1125 | 0.2635 |
|
||||||
|
| tartuNLP/Llama-3.1-EstLLM-8B-Instruct-0825 | 0.264 |
|
||||||
|
| tartuNLP/Apertus-EstLLM-8B-Instruct-1125| 0.2609 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct | 0.2602 |
|
||||||
|
| utter-project/EuroLLM-9B-Instruct-2512 | 0.2567 |
|
||||||
|
| swiss-ai/Apertus-8B-Instruct-2509 | 0.2372 |
|
||||||
|
| tartuNLP/Llammas | 0.1472 |
|
||||||
|
| meta-llama/Llama-3.1-8B-Instruct | 0.1406 |
|
||||||
|
| BSC-LT/salamandra-7b-instruct | 0.1201 |
|
||||||
|
| Qwen/Qwen2.5-7B-Instruct | 0.0476 |
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
```
|
||||||
|
@misc{dorkin2026estllmenhancingestoniancapabilities,
|
||||||
|
title={{EstLLM: Enhancing Estonian Capabilities in Multilingual LLMs via Continued Pretraining and Post-Training}},
|
||||||
|
author={Aleksei Dorkin and Taido Purason and Emil Kalbaliyev and Hele-Andra Kuulmets and Marii Ojastu and Mark Fišel and Tanel Alumäe and Eleri Aedmaa and Krister Kruusmaa and Kairit Sirts},
|
||||||
|
year={2026},
|
||||||
|
eprint={2603.02041},
|
||||||
|
archivePrefix={arXiv},
|
||||||
|
primaryClass={cs.CL},
|
||||||
|
url={https://arxiv.org/abs/2603.02041},
|
||||||
|
}
|
||||||
BIN
assets/logo-sinine.png
Normal file
BIN
assets/logo-sinine.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 31 KiB |
337
chat_template.jinja
Normal file
337
chat_template.jinja
Normal file
@@ -0,0 +1,337 @@
|
|||||||
|
{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
    {#-
        Emit a TypeScript-style type expression for one JSON-schema fragment.

        param_spec       schema mapping; the branches below read its type, items,
                         oneOf, enum, nullable, properties and required keys.
        required_params  only forwarded to the recursive calls.
        is_nullable      accepted for interface compatibility; never read here.

        Whitespace note: a tag without a leading "-" can emit the whitespace that
        precedes it, so the three such tags in this macro are deliberately kept at
        column 0. Do not indent them.
    -#}
    {%- if param_spec.type == "array" -%}
        {%- if param_spec['items'] -%}
            {%- if param_spec['items']['type'] == "string" -%}
                {{- "string[]" }}
            {%- elif param_spec['items']['type'] == "number" -%}
                {{- "number[]" }}
            {%- elif param_spec['items']['type'] == "integer" -%}
                {{- "number[]" }}
            {%- elif param_spec['items']['type'] == "boolean" -%}
                {{- "boolean[]" }}
            {%- else -%}
                {#- Non-primitive items: recurse, but fall back to any[] when the rendered inner type is the "object | object" union or longer than 50 characters. -#}
                {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
                {%- if inner_type == "object | object" or inner_type|length > 50 -%}
                    {{- "any[]" }}
                {%- else -%}
                    {{- inner_type + "[]" }}
                {%- endif -%}
            {%- endif -%}
            {%- if param_spec.nullable -%}
                {{- " | null" }}
            {%- endif -%}
        {%- else -%}
            {{- "any[]" }}
            {%- if param_spec.nullable -%}
                {{- " | null" }}
            {%- endif -%}
        {%- endif -%}
    {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
        {#- Handle array of types like ["object", "object"] from Union[dict, list] #}
        {%- if param_spec.type | length > 1 -%}
            {{- param_spec.type | join(" | ") }}
        {%- else -%}
            {{- param_spec.type[0] }}
        {%- endif -%}
    {%- elif param_spec.oneOf -%}
        {#- Handle oneOf schemas - check for complex unions and fallback to any.
            The flag lives in a namespace because a plain set inside a for loop is
            scoped to the loop body and would leave the outer value unchanged. #}
        {%- set oneof_state = namespace(has_object_variants=false) -%}
        {%- for variant in param_spec.oneOf -%}
            {%- if variant.type == "object" -%}
                {%- set oneof_state.has_object_variants = true -%}
            {%- endif -%}
        {%- endfor -%}
        {%- if oneof_state.has_object_variants and param_spec.oneOf|length > 1 -%}
            {{- "any" }}
        {%- else -%}
            {%- for variant in param_spec.oneOf -%}
                {{- render_typescript_type(variant, required_params) -}}
                {%- if variant.description %}
                    {{- "// " + variant.description }}
                {%- endif -%}
                {%- if variant.default is defined %}
{{ "// default: " + variant.default|tojson }}
                {%- endif -%}
                {%- if not loop.last %}
                    {{- " | " }}
{% endif -%}
            {%- endfor -%}
        {%- endif -%}
    {%- elif param_spec.type == "string" -%}
        {%- if param_spec.enum -%}
            {#- String enums become a union of quoted literals. -#}
            {{- '"' + param_spec.enum|join('" | "') + '"' -}}
        {%- else -%}
            {{- "string" }}
            {%- if param_spec.nullable %}
                {{- " | null" }}
            {%- endif -%}
        {%- endif -%}
    {%- elif param_spec.type == "number" -%}
        {{- "number" }}
    {%- elif param_spec.type == "integer" -%}
        {{- "number" }}
    {%- elif param_spec.type == "boolean" -%}
        {{- "boolean" }}
    {%- elif param_spec.type == "object" -%}
        {%- if param_spec.properties -%}
            {{- "{\n" }}
            {%- for prop_name, prop_spec in param_spec.properties.items() -%}
                {{- prop_name -}}
                {#- Properties missing from the object's own required list are marked optional. -#}
                {%- if prop_name not in (param_spec.required or []) -%}
                    {{- "?" }}
                {%- endif -%}
                {{- ": " }}
{{ render_typescript_type(prop_spec, param_spec.required or []) }}
                {%- if not loop.last -%}
                    {{-", " }}
                {%- endif -%}
            {%- endfor -%}
            {{- "}" }}
        {%- else -%}
            {{- "object" }}
        {%- endif -%}
    {%- else -%}
        {{- "any" }}
    {%- endif -%}
{%- endmacro -%}
|
||||||
|
|
||||||
|
{%- macro render_tools(tools) -%}
    {#-
        Render every tool as a TypeScript-style declaration:

            // <tool.description>
            type <tool.name> = (_: { <params> }) => any;

        Tools without parameter properties render as "() => any;".
        Each parameter is preceded by its description as a // comment (when it has
        one), gets a "?" suffix when it is not in tool.parameters.required, and is
        typed through render_typescript_type.
    -#}
    {%- for tool in tools %}
        {{- "// " + tool.description + "\n" }}
        {{- "type "+ tool.name + " = " }}
        {%- if tool.parameters and tool.parameters.properties %}
            {{- "(_: {\n" }}
            {%- for param_name, param_spec in tool.parameters.properties.items() %}
                {%- if param_spec.description %}
                    {{- "// " + param_spec.description + "\n" }}
                {%- endif %}
                {{- param_name }}
                {%- if param_name not in (tool.parameters.required or []) -%}
                    {{- "?" }}
                {%- endif -%}
                {{- ": " }}
                {{- render_typescript_type(param_spec, tool.parameters.required or []) }}
                {%- if param_spec.default is defined -%}
                    {#- Enum and oneOf defaults are printed raw when they are strings. A non-string default cannot be concatenated onto a string, so it is serialised with tojson first instead of raising. -#}
                    {%- set default_text = param_spec.default if param_spec.default is string else param_spec.default|tojson -%}
                    {%- if param_spec.enum %}
                        {{- ", // default: " + default_text }}
                    {%- elif param_spec.oneOf %}
                        {{- "// default: " + default_text }}
                    {%- else %}
                        {{- ", // default: " + param_spec.default|tojson }}
                    {%- endif -%}
                {%- endif -%}
                {%- if not loop.last %}
                    {{- ",\n" }}
                {%- else %}
                    {{- "\n" }}
                {%- endif -%}
            {%- endfor %}
            {{- "}) => any;" }}
        {%- else -%}
            {{- "() => any;" }}
        {%- endif -%}
        {#- Declarations are separated by a newline; none follows the last one. -#}
        {%- if not loop.last -%}
            {{- "\n" }}
        {%- endif -%}
    {%- endfor %}
{%- endmacro -%}
|
||||||
|
|
||||||
|
{{ bos_token }}

{#-
    Main chat-template body. Output order:
      1. bos token
      2. system block (first message when its role is "system", otherwise a default prompt)
      3. developer block (deliberation flag + tool capabilities)
      4. every remaining message, rendered by role
      5. closing tokens and the optional generation prompt
    Every tag here is flanked by whitespace-stripping tags, so indentation is cosmetic.
-#}

{#- Special-token strings used to delimit each section. -#}
{%- set system_token = '<|system_start|>' -%}
{%- set end_system_token = '<|system_end|>' -%}
{%- set developer_token = '<|developer_start|>' -%}
{%- set end_developer_token = '<|developer_end|>' -%}
{%- set user_token = '<|user_start|>' -%}
{%- set end_user_token = '<|user_end|>' -%}
{%- set assistant_token = '<|assistant_start|>' -%}
{%- set end_assistant_token = '<|assistant_end|>' -%}
{%- set inner_token = '<|inner_prefix|>' -%}
{%- set outer_token = '<|inner_suffix|>' -%}
{%- set tool_calls_token = '<|tools_prefix|>' -%}
{%- set end_tool_calls_token = '<|tools_suffix|>' -%}
{%- set image_token = '<|image|>' -%}

{#-
    Rendering state shared across loop iterations (a namespace, so assignments made inside loops persist):
      in_assistant              an assistant turn is open and not yet closed
      in_tool                   a "[" opened by a tool-role message is still open
      in_inner                  the inner (thoughts) section is open
      waiting_for_tool_outputs  tool calls were emitted and no outputs followed yet
      assistant_format          "string" or "mapping" once the first assistant content fixes it
-#}
{%- set ns = namespace(in_assistant=false, in_tool=false, in_inner=false, waiting_for_tool_outputs=false, assistant_format=none) -%}

{#- System block: an explicit system message must carry string content or a mapping with a text key. -#}
{%- if messages and messages[0].role == 'system' -%}
    {%- if "content" in messages[0] -%}
        {%- if messages[0].content is string -%}
            {{ system_token + messages[0].content + end_system_token }}
        {%- elif messages[0].content is mapping and "text" in messages[0].content -%}
            {{ system_token + messages[0].content.text + end_system_token }}
        {%- else -%}
            {{- raise_exception("Invalid system message") -}}
        {%- endif -%}
    {%- else -%}
        {{- raise_exception("Invalid system message") -}}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {{ system_token + 'You are Apertus, a helpful assistant created by the SwissAI initiative.\nKnowledge cutoff: 2024-04\nCurrent date: ' + strftime_now('%Y-%m-%d') + end_system_token }}
    {%- set loop_messages = messages -%}
{%- endif -%}

{#- Developer block: deliberation is "enabled" only when enable_thinking is defined and truthy. -#}
{{ developer_token + 'Deliberation: ' }}
{%- if enable_thinking is defined and enable_thinking -%}
    {{ 'enabled\n' }}
{%- else -%}
    {{ 'disabled\n' }}
{%- endif -%}
{%- if tools is defined and tools -%}
    {{ 'Tool Capabilities:\n' + render_tools(tools) }}
{%- else -%}
    {{ 'Tool Capabilities: disabled' }}
{%- endif -%}
{{ end_developer_token }}

{%- for message in loop_messages -%}
    {%- if message.role == 'user' -%}
        {#- A user turn closes any open tool-output list and any open assistant turn first. -#}
        {%- set ns.in_inner = false -%}
        {%- if ns.in_tool -%}
            {{ ']' }}
            {%- set ns.in_tool = false -%}
        {%- endif -%}
        {%- if ns.in_assistant -%}
            {{ end_assistant_token }}
            {%- set ns.in_assistant = false -%}
        {%- endif -%}
        {%- if "content" in message -%}
            {{ user_token }}
            {%- if message.content is string -%}
                {{ message.content }}
            {%- elif message.content is mapping and "parts" in message.content -%}
                {%- set parts = message.content.parts -%}
                {%- for part in parts -%}
                    {%- if part.type == "text" -%}
                        {{ part.text }}
                    {%- elif part.type == "image" -%}
                        {{ image_token }}
                    {%- else -%}
                        {{- raise_exception("Invalid user part: " + part.type) -}}
                    {%- endif -%}
                {%- endfor -%}
            {%- else -%}
                {{- raise_exception("Invalid user message: " + message.role) -}}
            {%- endif -%}
            {{ end_user_token }}
        {%- endif -%}
    {%- elif message.role == 'assistant' -%}
        {#- Consecutive assistant messages share one assistant turn; the start token is emitted once. -#}
        {%- if not ns.in_assistant -%}
            {{ assistant_token }}
            {%- set ns.in_assistant = true -%}
        {%- endif -%}
        {%- if "content" in message -%}
            {#- The first assistant content fixes the format (string or mapping); a later message in the other format falls through to the error branch. -#}
            {%- if message.content is string and (ns.assistant_format is none or ns.assistant_format == "string") -%}
                {%- if ns.in_tool -%}
                    {{ ']' }}
                    {%- set ns.in_tool = false -%}
                {%- endif -%}
                {%- set ns.assistant_format = "string" -%}
                {{ message.content }}
            {%- elif message.content is mapping and "blocks" in message.content and (ns.assistant_format is none or ns.assistant_format == "mapping") -%}
                {%- set ns.assistant_format = "mapping" -%}
                {%- set blocks = message.content.blocks -%}
                {%- for block in blocks -%}
                    {%- if block.type == 'thoughts' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {%- if not ns.in_inner -%}
                            {%- set ns.in_inner = true -%}
                            {{ inner_token }}
                        {%- endif -%}
                        {{ block.text }}
                    {%- elif block.type == 'tool_calls' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {#- A lone display_answers call that is not the first block closes the inner section before the call is emitted. -#}
                        {%- if ns.in_inner and not loop.first and block.calls|length == 1 and block.calls[0].name == 'display_answers' -%}
                            {%- set ns.in_inner = false -%}
                            {{ outer_token }}
                        {%- endif -%}
                        {{ tool_calls_token + '[' }}
                        {%- for tool_call in block.calls -%}
                            {#- String arguments are emitted verbatim. Anything else is serialised with tojson, because it cannot be concatenated onto a string. -#}
                            {{- '{"' + tool_call.name + '": ' }}
                            {%- if tool_call.arguments is string -%}
                                {{- tool_call.arguments }}
                            {%- else -%}
                                {{- tool_call.arguments | tojson }}
                            {%- endif -%}
                            {{- '}' }}
                            {%- if not loop.last -%}
                                {{- ", " }}
                            {%- endif -%}
                        {%- endfor -%}
                        {{ ']' + end_tool_calls_token }}
                        {%- set ns.waiting_for_tool_outputs = true -%}
                    {%- elif block.type == 'tool_outputs' -%}
                        {%- if ns.in_tool -%}
                            {{- raise_exception("Cannot have both tool outputs as separate messages and tool outputs as blocks") -}}
                        {%- endif -%}
                        {{ '[' }}
                        {%- for tool_output in block.outputs -%}
                            {{- tool_output.output }}
                            {%- if not loop.last -%}
                                {{- ", " }}
                            {%- endif -%}
                        {%- endfor -%}
                        {{- ']' }}
                        {%- set ns.waiting_for_tool_outputs = false -%}
                    {%- elif block.type == 'response' -%}
                        {%- if ns.in_tool -%}
                            {{ ']' }}
                            {%- set ns.in_tool = false -%}
                        {%- endif -%}
                        {#- Leaving the inner section emits the suffix token before the visible response text. -#}
                        {%- if (not loop.first and ns.in_inner) or (ns.in_assistant and ns.in_inner) -%}
                            {%- set ns.in_inner = false -%}
                            {{ outer_token }}
                        {%- endif -%}
                        {{ block.text }}
                    {%- else -%}
                        {{- raise_exception("Invalid assistant block type: " + block.type) -}}
                    {%- endif -%}
                {%- endfor -%}
            {%- else -%}
                {{- raise_exception("Invalid assistant content") -}}
            {%- endif -%}
        {%- else -%}
            {{- raise_exception("Invalid assistant message") -}}
        {%- endif -%}
        {#- Message-level tool_calls: each entry must have type "function". -#}
        {%- if "tool_calls" in message and message.tool_calls -%}
            {{ tool_calls_token + '[' }}
            {%- for tool_call in message.tool_calls -%}
                {%- if tool_call.type == 'function' -%}
                    {%- set function = tool_call.function -%}
                    {#- Same rule as for block-level calls: string arguments verbatim, anything else through tojson. -#}
                    {{- '{"' + function.name + '": ' }}
                    {%- if function.arguments is string -%}
                        {{- function.arguments }}
                    {%- else -%}
                        {{- function.arguments | tojson }}
                    {%- endif -%}
                    {{- '}' }}
                    {%- if not loop.last -%}
                        {{- ", " }}
                    {%- endif -%}
                {%- else -%}
                    {{- raise_exception("Invalid tool call type: " + tool_call.type) -}}
                {%- endif -%}
            {%- endfor -%}
            {{ ']' + end_tool_calls_token }}
            {%- set ns.waiting_for_tool_outputs = true -%}
        {%- endif -%}
    {%- elif message.role == 'tool' -%}
        {#- Tool messages are only valid inside an open assistant turn. Consecutive ones are joined into one bracketed, comma-separated list that is closed later. -#}
        {%- if not ns.in_assistant -%}
            {{- raise_exception("Tool message outside of assistant") -}}
        {%- endif -%}
        {%- if not ns.in_tool -%}
            {{ '[' }}
            {%- set ns.in_tool = true -%}
        {%- else -%}
            {{ ", "}}
        {%- endif -%}
        {{ message.content }}
        {%- set ns.waiting_for_tool_outputs = false -%}
    {%- else -%}
        {{- raise_exception("Invalid message role") -}}
    {%- endif -%}
{%- endfor -%}
{#- Epilogue: close a dangling tool-output list. The assistant turn is closed unless it is being continued or tool outputs are still pending. -#}
{%- if ns.in_tool -%}
    {{ ']' }}
{%- endif -%}
{%- if ns.in_assistant and not (continue_assistant_message is defined and continue_assistant_message) and not ns.waiting_for_tool_outputs -%}
    {{ end_assistant_token }}
{%- endif -%}
{%- if add_generation_prompt -%}
    {{ assistant_token }}
{%- endif -%}
|
||||||
38
config.json
Normal file
38
config.json
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
{
|
||||||
|
"architectures": [
|
||||||
|
"ApertusForCausalLM"
|
||||||
|
],
|
||||||
|
"attention_bias": false,
|
||||||
|
"attention_dropout": 0.0,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"eos_token_id": 68,
|
||||||
|
"hidden_act": "xielu",
|
||||||
|
"hidden_dropout": 0.0,
|
||||||
|
"hidden_size": 4096,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 21504,
|
||||||
|
"max_position_embeddings": 65536,
|
||||||
|
"mlp_bias": false,
|
||||||
|
"model_type": "apertus",
|
||||||
|
"num_attention_heads": 32,
|
||||||
|
"num_hidden_layers": 32,
|
||||||
|
"num_key_value_heads": 8,
|
||||||
|
"pad_token_id": 3,
|
||||||
|
"post_norm": false,
|
||||||
|
"qk_norm": true,
|
||||||
|
"rms_norm_eps": 1e-05,
|
||||||
|
"rope_scaling": {
|
||||||
|
"factor": 8.0,
|
||||||
|
"high_freq_factor": 4.0,
|
||||||
|
"low_freq_factor": 1.0,
|
||||||
|
"original_max_position_embeddings": 8192,
|
||||||
|
"rope_type": "llama3",
|
||||||
|
"type": "llama3"
|
||||||
|
},
|
||||||
|
"rope_theta": 12000000,
|
||||||
|
"tie_word_embeddings": false,
|
||||||
|
"transformers_version": "4.57.1",
|
||||||
|
"use_cache": false,
|
||||||
|
"vocab_size": 131072
|
||||||
|
}
|
||||||
11
generation_config.json
Normal file
11
generation_config.json
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
{
|
||||||
|
"_from_model_config": true,
|
||||||
|
"bos_token_id": 1,
|
||||||
|
"eos_token_id": [
|
||||||
|
2,
|
||||||
|
68,
|
||||||
|
72
|
||||||
|
],
|
||||||
|
"pad_token_id": 3,
|
||||||
|
"transformers_version": "4.57.1"
|
||||||
|
}
|
||||||
3
model-00001-of-00004.safetensors
Normal file
3
model-00001-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:174a0305fe59da00f3f6422438a98f783890935d906ebe607c219c3e5dc0adc9
|
||||||
|
size 4999776656
|
||||||
3
model-00002-of-00004.safetensors
Normal file
3
model-00002-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:bc87d197cda0fd413165ef75d5138aaa724ea7784d5551ca9311d916811c3620
|
||||||
|
size 4882374192
|
||||||
3
model-00003-of-00004.safetensors
Normal file
3
model-00003-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:83d860d9d63f353d5f22c44102129c9e45317810f6bd634933543cab83320098
|
||||||
|
size 4974647808
|
||||||
3
model-00004-of-00004.safetensors
Normal file
3
model-00004-of-00004.safetensors
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:f4ccfcc62aa610b56f7943a1c4e701661a8d755f077bfeef3d259847eb2f8745
|
||||||
|
size 1249928160
|
||||||
459
model.safetensors.index.json
Normal file
459
model.safetensors.index.json
Normal file
@@ -0,0 +1,459 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"total_parameters": 8053338176,
|
||||||
|
"total_size": 16106676480
|
||||||
|
},
|
||||||
|
"weight_map": {
|
||||||
|
"lm_head.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.embed_tokens.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.10.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.2.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.20.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.21.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.3.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.30.attention_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.feedforward_layernorm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.act_fn.alpha_n": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.act_fn.alpha_p": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.act_fn.beta": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.act_fn.eps": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.attention_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.feedforward_layernorm.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.act_fn.alpha_n": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.act_fn.alpha_p": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.act_fn.beta": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.act_fn.eps": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
|
||||||
|
"model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
|
||||||
|
"model.layers.4.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.attention_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.feedforward_layernorm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.act_fn.alpha_n": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.act_fn.alpha_p": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.act_fn.beta": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.act_fn.eps": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
|
||||||
|
"model.layers.9.attention_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.feedforward_layernorm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.act_fn.alpha_n": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.act_fn.alpha_p": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.act_fn.beta": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.act_fn.eps": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
|
||||||
|
"model.norm.weight": "model-00004-of-00004.safetensors"
|
||||||
|
}
|
||||||
|
}
|
||||||
30
special_tokens_map.json
Normal file
30
special_tokens_map.json
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
{
|
||||||
|
"bos_token": {
|
||||||
|
"content": "<s>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"eos_token": {
|
||||||
|
"content": "<|assistant_end|>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"pad_token": {
|
||||||
|
"content": "<pad>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
},
|
||||||
|
"unk_token": {
|
||||||
|
"content": "<unk>",
|
||||||
|
"lstrip": false,
|
||||||
|
"normalized": false,
|
||||||
|
"rstrip": false,
|
||||||
|
"single_word": false
|
||||||
|
}
|
||||||
|
}
|
||||||
3
tokenizer.json
Normal file
3
tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
version https://git-lfs.github.com/spec/v1
|
||||||
|
oid sha256:010095abf1dc6f52d4867584e7b3f0d4eece854593ae310220ec7782dd1b0a66
|
||||||
|
size 17078474
|
||||||
8020
tokenizer_config.json
Normal file
8020
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user