初始化项目,由ModelHub XC社区提供模型
Model: MahmoudIbrahim/Summary-0.1 Source: Original Platform
This commit is contained in:
35
.gitattributes
vendored
Normal file
35
.gitattributes
vendored
Normal file
@@ -0,0 +1,35 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.npy filter=lfs diff=lfs merge=lfs -text
|
||||
*.npz filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pickle filter=lfs diff=lfs merge=lfs -text
|
||||
*.pkl filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.wasm filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zst filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
58
README.md
Normal file
58
README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
---
|
||||
base_model: MahmoudIbrahim/Summary-0.1
|
||||
library_name: transformers
|
||||
model_name: Summary-0.1
|
||||
tags:
|
||||
- generated_from_trainer
|
||||
- trl
|
||||
- sft
|
||||
licence: license
|
||||
---
|
||||
|
||||
# Model Card for Summary-0.1
|
||||
|
||||
This model is a fine-tuned version of [MahmoudIbrahim/Summary-0.1](https://huggingface.co/MahmoudIbrahim/Summary-0.1).
|
||||
It has been trained using [TRL](https://github.com/huggingface/trl).
|
||||
|
||||
## Quick start
|
||||
|
||||
```python
|
||||
from transformers import pipeline
|
||||
|
||||
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
||||
generator = pipeline("text-generation", model="MahmoudIbrahim/Summary-0.1", device="cuda")
|
||||
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
||||
print(output["generated_text"])
|
||||
```
|
||||
|
||||
## Training procedure
|
||||
|
||||
|
||||
|
||||
|
||||
This model was trained with supervised fine-tuning (SFT).
|
||||
|
||||
### Framework versions
|
||||
|
||||
- TRL: 0.20.0
|
||||
- Transformers: 4.54.1
|
||||
- PyTorch: 2.10.0+cu128
|
||||
- Datasets: 4.0.0
|
||||
- Tokenizers: 0.21.4
|
||||
|
||||
## Citations
|
||||
|
||||
|
||||
|
||||
Cite TRL as:
|
||||
|
||||
```bibtex
|
||||
@misc{vonwerra2022trl,
|
||||
title = {{TRL: Transformer Reinforcement Learning}},
|
||||
author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
|
||||
year = 2020,
|
||||
journal = {GitHub repository},
|
||||
publisher = {GitHub},
|
||||
howpublished = {\url{https://github.com/huggingface/trl}}
|
||||
}
|
||||
```
|
||||
37
chat_template.jinja
Normal file
37
chat_template.jinja
Normal file
@@ -0,0 +1,37 @@
|
||||
{{- bos_token -}}
|
||||
{%- set system_prompt = "" -%}
|
||||
{%- set ns = namespace(system_prompt="") -%}
|
||||
{%- if messages[0]["role"] == "system" -%}
|
||||
{%- set ns.system_prompt = messages[0]["content"] -%}
|
||||
{%- set messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
{%- if tools -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
|
||||
{%- for tool in tools -%}
|
||||
{%- if tool is not string -%}
|
||||
{%- set tool = tool | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + tool -%}
|
||||
{%- if not loop.last -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
|
||||
{%- endif -%}
|
||||
{%- if ns.system_prompt -%}
|
||||
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
|
||||
{%- endif -%}
|
||||
{%- for message in messages -%}
|
||||
{{- "<|im_start|>" + message["role"] + "\n" -}}
|
||||
{%- set content = message["content"] -%}
|
||||
{%- if content is not string -%}
|
||||
{%- set content = content | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- if message["role"] == "tool" -%}
|
||||
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
|
||||
{%- endif -%}
|
||||
{{- content + "<|im_end|>\n" -}}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- "<|im_start|>assistant\n" -}}
|
||||
{%- endif -%}
|
||||
57
config.json
Normal file
57
config.json
Normal file
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"architectures": [
|
||||
"Lfm2ForCausalLM"
|
||||
],
|
||||
"block_auto_adjust_ff_dim": true,
|
||||
"block_dim": 1024,
|
||||
"block_ff_dim": 6656,
|
||||
"block_ffn_dim_multiplier": 1.0,
|
||||
"block_mlp_init_scale": 1.0,
|
||||
"block_multiple_of": 256,
|
||||
"block_norm_eps": 1e-05,
|
||||
"block_out_init_scale": 1.0,
|
||||
"block_use_swiglu": true,
|
||||
"block_use_xavier_init": true,
|
||||
"bos_token_id": 1,
|
||||
"conv_L_cache": 3,
|
||||
"conv_bias": false,
|
||||
"conv_dim": 1024,
|
||||
"conv_dim_out": 1024,
|
||||
"conv_use_xavier_init": true,
|
||||
"eos_token_id": 7,
|
||||
"hidden_size": 1024,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 6656,
|
||||
"layer_types": [
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv"
|
||||
],
|
||||
"max_position_embeddings": 128000,
|
||||
"model_type": "lfm2",
|
||||
"norm_eps": 1e-05,
|
||||
"num_attention_heads": 16,
|
||||
"num_heads": 16,
|
||||
"num_hidden_layers": 16,
|
||||
"num_key_value_heads": 8,
|
||||
"pad_token_id": 0,
|
||||
"rope_theta": 1000000.0,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.54.1",
|
||||
"use_cache": true,
|
||||
"use_pos_enc": true,
|
||||
"vocab_size": 65536
|
||||
}
|
||||
7
generation_config.json
Normal file
7
generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 1,
|
||||
"eos_token_id": 7,
|
||||
"pad_token_id": 0,
|
||||
"transformers_version": "4.54.1"
|
||||
}
|
||||
37
last-checkpoint/chat_template.jinja
Normal file
37
last-checkpoint/chat_template.jinja
Normal file
@@ -0,0 +1,37 @@
|
||||
{{- bos_token -}}
|
||||
{%- set system_prompt = "" -%}
|
||||
{%- set ns = namespace(system_prompt="") -%}
|
||||
{%- if messages[0]["role"] == "system" -%}
|
||||
{%- set ns.system_prompt = messages[0]["content"] -%}
|
||||
{%- set messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
{%- if tools -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
|
||||
{%- for tool in tools -%}
|
||||
{%- if tool is not string -%}
|
||||
{%- set tool = tool | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + tool -%}
|
||||
{%- if not loop.last -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
|
||||
{%- endif -%}
|
||||
{%- if ns.system_prompt -%}
|
||||
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
|
||||
{%- endif -%}
|
||||
{%- for message in messages -%}
|
||||
{{- "<|im_start|>" + message["role"] + "\n" -}}
|
||||
{%- set content = message["content"] -%}
|
||||
{%- if content is not string -%}
|
||||
{%- set content = content | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- if message["role"] == "tool" -%}
|
||||
{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
|
||||
{%- endif -%}
|
||||
{{- content + "<|im_end|>\n" -}}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- "<|im_start|>assistant\n" -}}
|
||||
{%- endif -%}
|
||||
57
last-checkpoint/config.json
Normal file
57
last-checkpoint/config.json
Normal file
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"architectures": [
|
||||
"Lfm2ForCausalLM"
|
||||
],
|
||||
"block_auto_adjust_ff_dim": true,
|
||||
"block_dim": 1024,
|
||||
"block_ff_dim": 6656,
|
||||
"block_ffn_dim_multiplier": 1.0,
|
||||
"block_mlp_init_scale": 1.0,
|
||||
"block_multiple_of": 256,
|
||||
"block_norm_eps": 1e-05,
|
||||
"block_out_init_scale": 1.0,
|
||||
"block_use_swiglu": true,
|
||||
"block_use_xavier_init": true,
|
||||
"bos_token_id": 1,
|
||||
"conv_L_cache": 3,
|
||||
"conv_bias": false,
|
||||
"conv_dim": 1024,
|
||||
"conv_dim_out": 1024,
|
||||
"conv_use_xavier_init": true,
|
||||
"eos_token_id": 7,
|
||||
"hidden_size": 1024,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 6656,
|
||||
"layer_types": [
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv",
|
||||
"full_attention",
|
||||
"conv"
|
||||
],
|
||||
"max_position_embeddings": 128000,
|
||||
"model_type": "lfm2",
|
||||
"norm_eps": 1e-05,
|
||||
"num_attention_heads": 16,
|
||||
"num_heads": 16,
|
||||
"num_hidden_layers": 16,
|
||||
"num_key_value_heads": 8,
|
||||
"pad_token_id": 0,
|
||||
"rope_theta": 1000000.0,
|
||||
"torch_dtype": "bfloat16",
|
||||
"transformers_version": "4.54.1",
|
||||
"use_cache": true,
|
||||
"use_pos_enc": true,
|
||||
"vocab_size": 65536
|
||||
}
|
||||
7
last-checkpoint/generation_config.json
Normal file
7
last-checkpoint/generation_config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 1,
|
||||
"eos_token_id": 7,
|
||||
"pad_token_id": 0,
|
||||
"transformers_version": "4.54.1"
|
||||
}
|
||||
3
last-checkpoint/model.safetensors
Normal file
3
last-checkpoint/model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
|
||||
size 708984464
|
||||
3
last-checkpoint/optimizer.pt
Normal file
3
last-checkpoint/optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6ff715c5c4b7242e62e8b0f87aa52e4c61b186ff4bcf9bd6ae6f996717938936
|
||||
size 1418063051
|
||||
3
last-checkpoint/rng_state.pth
Normal file
3
last-checkpoint/rng_state.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
|
||||
size 14645
|
||||
3
last-checkpoint/scheduler.pt
Normal file
3
last-checkpoint/scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:84bf56f1dd07d671cce095f4098c57df5ec6e431abdf62fc0db9c16e7ca046e1
|
||||
size 1465
|
||||
23
last-checkpoint/special_tokens_map.json
Normal file
23
last-checkpoint/special_tokens_map.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|startoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
323812
last-checkpoint/tokenizer.json
Normal file
323812
last-checkpoint/tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
4078
last-checkpoint/tokenizer_config.json
Normal file
4078
last-checkpoint/tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
514
last-checkpoint/trainer_state.json
Normal file
514
last-checkpoint/trainer_state.json
Normal file
@@ -0,0 +1,514 @@
|
||||
{
|
||||
"best_global_step": 334,
|
||||
"best_metric": 1.941367506980896,
|
||||
"best_model_checkpoint": "./Summary-0.1/checkpoint-334",
|
||||
"epoch": 3.0,
|
||||
"eval_steps": 500,
|
||||
"global_step": 501,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.059880239520958084,
|
||||
"grad_norm": 38.75,
|
||||
"learning_rate": 4.5e-06,
|
||||
"loss": 2.7202,
|
||||
"mean_token_accuracy": 0.45009988248348237,
|
||||
"num_tokens": 29469.0,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"epoch": 0.11976047904191617,
|
||||
"grad_norm": 9.25,
|
||||
"learning_rate": 9.5e-06,
|
||||
"loss": 2.2626,
|
||||
"mean_token_accuracy": 0.5219378590583801,
|
||||
"num_tokens": 58579.0,
|
||||
"step": 20
|
||||
},
|
||||
{
|
||||
"epoch": 0.17964071856287425,
|
||||
"grad_norm": 6.375,
|
||||
"learning_rate": 1.45e-05,
|
||||
"loss": 2.2181,
|
||||
"mean_token_accuracy": 0.518680065870285,
|
||||
"num_tokens": 88193.0,
|
||||
"step": 30
|
||||
},
|
||||
{
|
||||
"epoch": 0.23952095808383234,
|
||||
"grad_norm": 5.9375,
|
||||
"learning_rate": 1.9500000000000003e-05,
|
||||
"loss": 2.0667,
|
||||
"mean_token_accuracy": 0.5430938720703125,
|
||||
"num_tokens": 116933.0,
|
||||
"step": 40
|
||||
},
|
||||
{
|
||||
"epoch": 0.2994011976047904,
|
||||
"grad_norm": 5.65625,
|
||||
"learning_rate": 2.45e-05,
|
||||
"loss": 2.1906,
|
||||
"mean_token_accuracy": 0.5175440430641174,
|
||||
"num_tokens": 146513.0,
|
||||
"step": 50
|
||||
},
|
||||
{
|
||||
"epoch": 0.3592814371257485,
|
||||
"grad_norm": 5.9375,
|
||||
"learning_rate": 2.95e-05,
|
||||
"loss": 2.092,
|
||||
"mean_token_accuracy": 0.5363078862428665,
|
||||
"num_tokens": 175698.0,
|
||||
"step": 60
|
||||
},
|
||||
{
|
||||
"epoch": 0.41916167664670656,
|
||||
"grad_norm": 4.9375,
|
||||
"learning_rate": 3.45e-05,
|
||||
"loss": 1.9358,
|
||||
"mean_token_accuracy": 0.568104338645935,
|
||||
"num_tokens": 202607.0,
|
||||
"step": 70
|
||||
},
|
||||
{
|
||||
"epoch": 0.47904191616766467,
|
||||
"grad_norm": 5.1875,
|
||||
"learning_rate": 3.9500000000000005e-05,
|
||||
"loss": 2.0201,
|
||||
"mean_token_accuracy": 0.550568813085556,
|
||||
"num_tokens": 231199.0,
|
||||
"step": 80
|
||||
},
|
||||
{
|
||||
"epoch": 0.5389221556886228,
|
||||
"grad_norm": 5.03125,
|
||||
"learning_rate": 4.4500000000000004e-05,
|
||||
"loss": 1.9076,
|
||||
"mean_token_accuracy": 0.5696590662002563,
|
||||
"num_tokens": 260064.0,
|
||||
"step": 90
|
||||
},
|
||||
{
|
||||
"epoch": 0.5988023952095808,
|
||||
"grad_norm": 5.03125,
|
||||
"learning_rate": 4.9500000000000004e-05,
|
||||
"loss": 1.9292,
|
||||
"mean_token_accuracy": 0.5686947405338287,
|
||||
"num_tokens": 288635.0,
|
||||
"step": 100
|
||||
},
|
||||
{
|
||||
"epoch": 0.6586826347305389,
|
||||
"grad_norm": 5.40625,
|
||||
"learning_rate": 4.8076923076923084e-05,
|
||||
"loss": 2.0754,
|
||||
"mean_token_accuracy": 0.5396111845970154,
|
||||
"num_tokens": 315994.0,
|
||||
"step": 110
|
||||
},
|
||||
{
|
||||
"epoch": 0.718562874251497,
|
||||
"grad_norm": 4.78125,
|
||||
"learning_rate": 4.594017094017094e-05,
|
||||
"loss": 1.942,
|
||||
"mean_token_accuracy": 0.567721825838089,
|
||||
"num_tokens": 345852.0,
|
||||
"step": 120
|
||||
},
|
||||
{
|
||||
"epoch": 0.7784431137724551,
|
||||
"grad_norm": 4.84375,
|
||||
"learning_rate": 4.3803418803418805e-05,
|
||||
"loss": 2.0083,
|
||||
"mean_token_accuracy": 0.5502024054527282,
|
||||
"num_tokens": 375046.0,
|
||||
"step": 130
|
||||
},
|
||||
{
|
||||
"epoch": 0.8383233532934131,
|
||||
"grad_norm": 4.59375,
|
||||
"learning_rate": 4.166666666666667e-05,
|
||||
"loss": 1.9348,
|
||||
"mean_token_accuracy": 0.563525739312172,
|
||||
"num_tokens": 403950.0,
|
||||
"step": 140
|
||||
},
|
||||
{
|
||||
"epoch": 0.8982035928143712,
|
||||
"grad_norm": 4.90625,
|
||||
"learning_rate": 3.952991452991453e-05,
|
||||
"loss": 1.8796,
|
||||
"mean_token_accuracy": 0.5778123795986175,
|
||||
"num_tokens": 433871.0,
|
||||
"step": 150
|
||||
},
|
||||
{
|
||||
"epoch": 0.9580838323353293,
|
||||
"grad_norm": 4.375,
|
||||
"learning_rate": 3.739316239316239e-05,
|
||||
"loss": 2.0021,
|
||||
"mean_token_accuracy": 0.5490101099014282,
|
||||
"num_tokens": 463543.0,
|
||||
"step": 160
|
||||
},
|
||||
{
|
||||
"epoch": 1.0,
|
||||
"eval_loss": 1.9482988119125366,
|
||||
"eval_mean_token_accuracy": 0.5604007748457102,
|
||||
"eval_num_tokens": 482617.0,
|
||||
"eval_runtime": 39.6692,
|
||||
"eval_samples_per_second": 2.521,
|
||||
"eval_steps_per_second": 0.328,
|
||||
"step": 167
|
||||
},
|
||||
{
|
||||
"epoch": 1.0179640718562875,
|
||||
"grad_norm": 4.53125,
|
||||
"learning_rate": 3.525641025641026e-05,
|
||||
"loss": 1.8939,
|
||||
"mean_token_accuracy": 0.5751068115234375,
|
||||
"num_tokens": 491782.0,
|
||||
"step": 170
|
||||
},
|
||||
{
|
||||
"epoch": 1.0778443113772456,
|
||||
"grad_norm": 4.25,
|
||||
"learning_rate": 3.311965811965812e-05,
|
||||
"loss": 1.572,
|
||||
"mean_token_accuracy": 0.6416267931461335,
|
||||
"num_tokens": 521599.0,
|
||||
"step": 180
|
||||
},
|
||||
{
|
||||
"epoch": 1.1377245508982037,
|
||||
"grad_norm": 4.71875,
|
||||
"learning_rate": 3.098290598290599e-05,
|
||||
"loss": 1.5429,
|
||||
"mean_token_accuracy": 0.6442601144313812,
|
||||
"num_tokens": 550968.0,
|
||||
"step": 190
|
||||
},
|
||||
{
|
||||
"epoch": 1.1976047904191618,
|
||||
"grad_norm": 4.03125,
|
||||
"learning_rate": 2.8846153846153845e-05,
|
||||
"loss": 1.4855,
|
||||
"mean_token_accuracy": 0.6554409444332123,
|
||||
"num_tokens": 579850.0,
|
||||
"step": 200
|
||||
},
|
||||
{
|
||||
"epoch": 1.2574850299401197,
|
||||
"grad_norm": 4.53125,
|
||||
"learning_rate": 2.670940170940171e-05,
|
||||
"loss": 1.5925,
|
||||
"mean_token_accuracy": 0.6354905068874359,
|
||||
"num_tokens": 607566.0,
|
||||
"step": 210
|
||||
},
|
||||
{
|
||||
"epoch": 1.3173652694610778,
|
||||
"grad_norm": 4.34375,
|
||||
"learning_rate": 2.4572649572649573e-05,
|
||||
"loss": 1.6961,
|
||||
"mean_token_accuracy": 0.611818504333496,
|
||||
"num_tokens": 636366.0,
|
||||
"step": 220
|
||||
},
|
||||
{
|
||||
"epoch": 1.377245508982036,
|
||||
"grad_norm": 4.28125,
|
||||
"learning_rate": 2.2435897435897437e-05,
|
||||
"loss": 1.6871,
|
||||
"mean_token_accuracy": 0.6123433768749237,
|
||||
"num_tokens": 665501.0,
|
||||
"step": 230
|
||||
},
|
||||
{
|
||||
"epoch": 1.437125748502994,
|
||||
"grad_norm": 5.34375,
|
||||
"learning_rate": 2.02991452991453e-05,
|
||||
"loss": 1.6756,
|
||||
"mean_token_accuracy": 0.6171969532966614,
|
||||
"num_tokens": 692397.0,
|
||||
"step": 240
|
||||
},
|
||||
{
|
||||
"epoch": 1.4970059880239521,
|
||||
"grad_norm": 4.40625,
|
||||
"learning_rate": 1.8162393162393162e-05,
|
||||
"loss": 1.6237,
|
||||
"mean_token_accuracy": 0.6245103716850281,
|
||||
"num_tokens": 720330.0,
|
||||
"step": 250
|
||||
},
|
||||
{
|
||||
"epoch": 1.55688622754491,
|
||||
"grad_norm": 3.9375,
|
||||
"learning_rate": 1.602564102564103e-05,
|
||||
"loss": 1.6596,
|
||||
"mean_token_accuracy": 0.6197108209133149,
|
||||
"num_tokens": 747976.0,
|
||||
"step": 260
|
||||
},
|
||||
{
|
||||
"epoch": 1.6167664670658684,
|
||||
"grad_norm": 4.3125,
|
||||
"learning_rate": 1.388888888888889e-05,
|
||||
"loss": 1.6428,
|
||||
"mean_token_accuracy": 0.6236989557743072,
|
||||
"num_tokens": 776610.0,
|
||||
"step": 270
|
||||
},
|
||||
{
|
||||
"epoch": 1.6766467065868262,
|
||||
"grad_norm": 4.125,
|
||||
"learning_rate": 1.1752136752136752e-05,
|
||||
"loss": 1.6659,
|
||||
"mean_token_accuracy": 0.6144291937351227,
|
||||
"num_tokens": 806986.0,
|
||||
"step": 280
|
||||
},
|
||||
{
|
||||
"epoch": 1.7365269461077846,
|
||||
"grad_norm": 4.5625,
|
||||
"learning_rate": 9.615384615384616e-06,
|
||||
"loss": 1.6745,
|
||||
"mean_token_accuracy": 0.6144052445888519,
|
||||
"num_tokens": 835571.0,
|
||||
"step": 290
|
||||
},
|
||||
{
|
||||
"epoch": 1.7964071856287425,
|
||||
"grad_norm": 4.625,
|
||||
"learning_rate": 7.478632478632479e-06,
|
||||
"loss": 1.6576,
|
||||
"mean_token_accuracy": 0.6180627286434174,
|
||||
"num_tokens": 865294.0,
|
||||
"step": 300
|
||||
},
|
||||
{
|
||||
"epoch": 1.8562874251497006,
|
||||
"grad_norm": 4.25,
|
||||
"learning_rate": 5.341880341880342e-06,
|
||||
"loss": 1.6627,
|
||||
"mean_token_accuracy": 0.6169491648674011,
|
||||
"num_tokens": 894249.0,
|
||||
"step": 310
|
||||
},
|
||||
{
|
||||
"epoch": 1.9161676646706587,
|
||||
"grad_norm": 4.96875,
|
||||
"learning_rate": 3.205128205128205e-06,
|
||||
"loss": 1.5248,
|
||||
"mean_token_accuracy": 0.6464354753494262,
|
||||
"num_tokens": 924046.0,
|
||||
"step": 320
|
||||
},
|
||||
{
|
||||
"epoch": 1.9760479041916168,
|
||||
"grad_norm": 4.09375,
|
||||
"learning_rate": 1.0683760683760685e-06,
|
||||
"loss": 1.6545,
|
||||
"mean_token_accuracy": 0.6218094885349273,
|
||||
"num_tokens": 954354.0,
|
||||
"step": 330
|
||||
},
|
||||
{
|
||||
"epoch": 2.0,
|
||||
"eval_loss": 1.941367506980896,
|
||||
"eval_mean_token_accuracy": 0.564078491467696,
|
||||
"eval_num_tokens": 965234.0,
|
||||
"eval_runtime": 39.4675,
|
||||
"eval_samples_per_second": 2.534,
|
||||
"eval_steps_per_second": 0.329,
|
||||
"step": 334
|
||||
},
|
||||
{
|
||||
"epoch": 2.035928143712575,
|
||||
"grad_norm": 4.375,
|
||||
"learning_rate": 2.0199501246882794e-05,
|
||||
"loss": 1.3888,
|
||||
"mean_token_accuracy": 0.6779806514581045,
|
||||
"num_tokens": 16794.0,
|
||||
"step": 340
|
||||
},
|
||||
{
|
||||
"epoch": 2.095808383233533,
|
||||
"grad_norm": 4.625,
|
||||
"learning_rate": 1.8952618453865337e-05,
|
||||
"loss": 1.5284,
|
||||
"mean_token_accuracy": 0.6481667637825013,
|
||||
"num_tokens": 44874.0,
|
||||
"step": 350
|
||||
},
|
||||
{
|
||||
"epoch": 2.155688622754491,
|
||||
"grad_norm": 5.90625,
|
||||
"learning_rate": 1.770573566084788e-05,
|
||||
"loss": 1.5412,
|
||||
"mean_token_accuracy": 0.6448413729667664,
|
||||
"num_tokens": 71727.0,
|
||||
"step": 360
|
||||
},
|
||||
{
|
||||
"epoch": 2.215568862275449,
|
||||
"grad_norm": 4.3125,
|
||||
"learning_rate": 1.6458852867830423e-05,
|
||||
"loss": 1.5195,
|
||||
"mean_token_accuracy": 0.6477404713630677,
|
||||
"num_tokens": 101889.0,
|
||||
"step": 370
|
||||
},
|
||||
{
|
||||
"epoch": 2.2754491017964074,
|
||||
"grad_norm": 4.0,
|
||||
"learning_rate": 1.5211970074812968e-05,
|
||||
"loss": 1.4846,
|
||||
"mean_token_accuracy": 0.6572466909885406,
|
||||
"num_tokens": 132140.0,
|
||||
"step": 380
|
||||
},
|
||||
{
|
||||
"epoch": 2.3353293413173652,
|
||||
"grad_norm": 4.3125,
|
||||
"learning_rate": 1.396508728179551e-05,
|
||||
"loss": 1.604,
|
||||
"mean_token_accuracy": 0.6328544735908508,
|
||||
"num_tokens": 161063.0,
|
||||
"step": 390
|
||||
},
|
||||
{
|
||||
"epoch": 2.3952095808383236,
|
||||
"grad_norm": 4.71875,
|
||||
"learning_rate": 1.2718204488778054e-05,
|
||||
"loss": 1.5462,
|
||||
"mean_token_accuracy": 0.6424768209457398,
|
||||
"num_tokens": 189798.0,
|
||||
"step": 400
|
||||
},
|
||||
{
|
||||
"epoch": 2.4550898203592815,
|
||||
"grad_norm": 4.34375,
|
||||
"learning_rate": 1.1471321695760599e-05,
|
||||
"loss": 1.5468,
|
||||
"mean_token_accuracy": 0.6386782228946686,
|
||||
"num_tokens": 219194.0,
|
||||
"step": 410
|
||||
},
|
||||
{
|
||||
"epoch": 2.5149700598802394,
|
||||
"grad_norm": 4.65625,
|
||||
"learning_rate": 1.0224438902743143e-05,
|
||||
"loss": 1.5713,
|
||||
"mean_token_accuracy": 0.6371028661727905,
|
||||
"num_tokens": 249445.0,
|
||||
"step": 420
|
||||
},
|
||||
{
|
||||
"epoch": 2.5748502994011977,
|
||||
"grad_norm": 3.875,
|
||||
"learning_rate": 8.977556109725686e-06,
|
||||
"loss": 1.4073,
|
||||
"mean_token_accuracy": 0.6741897523403168,
|
||||
"num_tokens": 277096.0,
|
||||
"step": 430
|
||||
},
|
||||
{
|
||||
"epoch": 2.6347305389221556,
|
||||
"grad_norm": 5.15625,
|
||||
"learning_rate": 7.73067331670823e-06,
|
||||
"loss": 1.5873,
|
||||
"mean_token_accuracy": 0.6345809698104858,
|
||||
"num_tokens": 306656.0,
|
||||
"step": 440
|
||||
},
|
||||
{
|
||||
"epoch": 2.694610778443114,
|
||||
"grad_norm": 4.125,
|
||||
"learning_rate": 6.483790523690773e-06,
|
||||
"loss": 1.5398,
|
||||
"mean_token_accuracy": 0.6446067214012146,
|
||||
"num_tokens": 334989.0,
|
||||
"step": 450
|
||||
},
|
||||
{
|
||||
"epoch": 2.754491017964072,
|
||||
"grad_norm": 4.78125,
|
||||
"learning_rate": 5.236907730673317e-06,
|
||||
"loss": 1.4324,
|
||||
"mean_token_accuracy": 0.6648351371288299,
|
||||
"num_tokens": 363393.0,
|
||||
"step": 460
|
||||
},
|
||||
{
|
||||
"epoch": 2.81437125748503,
|
||||
"grad_norm": 4.375,
|
||||
"learning_rate": 3.99002493765586e-06,
|
||||
"loss": 1.4939,
|
||||
"mean_token_accuracy": 0.6552098572254181,
|
||||
"num_tokens": 392445.0,
|
||||
"step": 470
|
||||
},
|
||||
{
|
||||
"epoch": 2.874251497005988,
|
||||
"grad_norm": 4.625,
|
||||
"learning_rate": 2.743142144638404e-06,
|
||||
"loss": 1.5451,
|
||||
"mean_token_accuracy": 0.6401039361953735,
|
||||
"num_tokens": 421549.0,
|
||||
"step": 480
|
||||
},
|
||||
{
|
||||
"epoch": 2.934131736526946,
|
||||
"grad_norm": 4.9375,
|
||||
"learning_rate": 1.4962593516209476e-06,
|
||||
"loss": 1.5018,
|
||||
"mean_token_accuracy": 0.6498535394668579,
|
||||
"num_tokens": 450516.0,
|
||||
"step": 490
|
||||
},
|
||||
{
|
||||
"epoch": 2.9940119760479043,
|
||||
"grad_norm": 4.8125,
|
||||
"learning_rate": 2.4937655860349126e-07,
|
||||
"loss": 1.502,
|
||||
"mean_token_accuracy": 0.651291674375534,
|
||||
"num_tokens": 480685.0,
|
||||
"step": 500
|
||||
},
|
||||
{
|
||||
"epoch": 3.0,
|
||||
"eval_loss": 1.9642236232757568,
|
||||
"eval_mean_token_accuracy": 0.5623479668910687,
|
||||
"eval_num_tokens": 482617.0,
|
||||
"eval_runtime": 39.682,
|
||||
"eval_samples_per_second": 2.52,
|
||||
"eval_steps_per_second": 0.328,
|
||||
"step": 501
|
||||
}
|
||||
],
|
||||
"logging_steps": 10,
|
||||
"max_steps": 501,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 3,
|
||||
"save_steps": 500,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 2647290262044672.0,
|
||||
"train_batch_size": 3,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
last-checkpoint/training_args.bin
Normal file
3
last-checkpoint/training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
|
||||
size 6161
|
||||
3
model.safetensors
Normal file
3
model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
|
||||
size 708984464
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ecb847ee90726b4abe0c66ebdd31ab3934722a7e8815cb81d50774a1a4ceb1cd
|
||||
size 7001
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:7e703d3d3c34c61650f8af92ebae0b19efa998caec5e19fdce81da57cffb05dd
|
||||
size 12615
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:be2dd8537a167d5d53fdcc94b5dc73cf39193cd2884a775f0507c8f1e0e52112
|
||||
size 6029
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:98dbb1c20ba50409e061913a84d58f2edbc604b32147f47535808e53fbc9a8d3
|
||||
size 17977
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:17ab5aa730b700aa9329747cec3f3f2e444669c6922c393bc7b9f7ba2fc388e3
|
||||
size 6502
|
||||
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:717930e75e2e44e089ccb908f3dd2ae0efedc6cc94050105ee0c8b9167e13e70
|
||||
size 12381
|
||||
23
special_tokens_map.json
Normal file
23
special_tokens_map.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"bos_token": {
|
||||
"content": "<|startoftext|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": {
|
||||
"content": "<|pad|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
}
|
||||
}
|
||||
323812
tokenizer.json
Normal file
323812
tokenizer.json
Normal file
File diff suppressed because it is too large
Load Diff
4078
tokenizer_config.json
Normal file
4078
tokenizer_config.json
Normal file
File diff suppressed because it is too large
Load Diff
3
training_args.bin
Normal file
3
training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
|
||||
size 6161
|
||||
Reference in New Issue
Block a user