初始化项目,由ModelHub XC社区提供模型

Model: MahmoudIbrahim/Summary-0.1
Source: Original Platform
This commit is contained in:
ModelHub XC
2026-05-27 15:12:24 +08:00
commit a5967341f6
28 changed files with 656674 additions and 0 deletions

35
.gitattributes vendored Normal file
View File

@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text

58
README.md Normal file
View File

@@ -0,0 +1,58 @@
---
base_model: MahmoudIbrahim/Summary-0.1
library_name: transformers
model_name: Summary-0.1
tags:
- generated_from_trainer
- trl
- sft
licence: license
---
# Model Card for Summary-0.1
This model is a fine-tuned version of [MahmoudIbrahim/Summary-0.1](https://huggingface.co/MahmoudIbrahim/Summary-0.1).
It has been trained using [TRL](https://github.com/huggingface/trl).
## Quick start
```python
from transformers import pipeline

# Build a text-generation pipeline for the published checkpoint, placed on the CUDA device.
chat = pipeline("text-generation", model="MahmoudIbrahim/Summary-0.1", device="cuda")

# A single-turn conversation in the role/content message format.
prompt = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
conversation = [{"role": "user", "content": prompt}]

# Generate at most 128 new tokens and print the text of the first result.
replies = chat(conversation, max_new_tokens=128, return_full_text=False)
print(replies[0]["generated_text"])
```
## Training procedure
This model was trained with supervised fine-tuning (SFT).
### Framework versions
- TRL: 0.20.0
- Transformers: 4.54.1
- PyTorch: 2.10.0+cu128
- Datasets: 4.0.0
- Tokenizers: 0.21.4
## Citations
Cite TRL as:
```bibtex
@misc{vonwerra2022trl,
title = {{TRL: Transformer Reinforcement Learning}},
author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
year = 2020,
journal = {GitHub repository},
publisher = {GitHub},
howpublished = {\url{https://github.com/huggingface/trl}}
}
```

37
chat_template.jinja Normal file
View File

@@ -0,0 +1,37 @@
{#- Chat template. Output order: BOS token, an optional system turn (system message plus tool list), one turn per remaining message, then an optional open assistant turn. -#}
{{- bos_token -}}
{#- A namespace is used so the system prompt can be extended from inside the tool loop below. -#}
{%- set ns = namespace(system_prompt="") -%}
{#- Lift a leading system message out of the turn list. Checking `messages` first keeps an empty conversation from failing on messages[0]. -#}
{%- if messages and messages[0]["role"] == "system" -%}
    {%- set ns.system_prompt = messages[0]["content"] -%}
    {%- set messages = messages[1:] -%}
{%- endif -%}
{#- Append the tool definitions to the system prompt as a comma-separated list; tools that are not already strings are serialized with tojson. -#}
{%- if tools -%}
    {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
    {%- for tool in tools -%}
        {%- if tool is not string -%}
            {%- set tool = tool | tojson -%}
        {%- endif -%}
        {%- set ns.system_prompt = ns.system_prompt + tool -%}
        {%- if not loop.last -%}
            {%- set ns.system_prompt = ns.system_prompt + ", " -%}
        {%- endif -%}
    {%- endfor -%}
    {%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{%- endif -%}
{#- The system turn is emitted only when there is a system message or a tool list. -#}
{%- if ns.system_prompt -%}
    {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{#- One turn per message; non-string content is serialized with tojson, and tool results are wrapped in tool-response markers. -#}
{%- for message in messages -%}
    {{- "<|im_start|>" + message["role"] + "\n" -}}
    {%- set content = message["content"] -%}
    {%- if content is not string -%}
        {%- set content = content | tojson -%}
    {%- endif -%}
    {%- if message["role"] == "tool" -%}
        {%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
    {%- endif -%}
    {{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{#- Open an assistant turn when the caller asks for a generation prompt. -#}
{%- if add_generation_prompt -%}
    {{- "<|im_start|>assistant\n" -}}
{%- endif -%}

57
config.json Normal file
View File

@@ -0,0 +1,57 @@
{
"architectures": [
"Lfm2ForCausalLM"
],
"block_auto_adjust_ff_dim": true,
"block_dim": 1024,
"block_ff_dim": 6656,
"block_ffn_dim_multiplier": 1.0,
"block_mlp_init_scale": 1.0,
"block_multiple_of": 256,
"block_norm_eps": 1e-05,
"block_out_init_scale": 1.0,
"block_use_swiglu": true,
"block_use_xavier_init": true,
"bos_token_id": 1,
"conv_L_cache": 3,
"conv_bias": false,
"conv_dim": 1024,
"conv_dim_out": 1024,
"conv_use_xavier_init": true,
"eos_token_id": 7,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 6656,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
],
"max_position_embeddings": 128000,
"model_type": "lfm2",
"norm_eps": 1e-05,
"num_attention_heads": 16,
"num_heads": 16,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"pad_token_id": 0,
"rope_theta": 1000000.0,
"torch_dtype": "bfloat16",
"transformers_version": "4.54.1",
"use_cache": true,
"use_pos_enc": true,
"vocab_size": 65536
}

7
generation_config.json Normal file
View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"eos_token_id": 7,
"pad_token_id": 0,
"transformers_version": "4.54.1"
}

View File

@@ -0,0 +1,37 @@
{#- Chat template. Output order: BOS token, an optional system turn (system message plus tool list), one turn per remaining message, then an optional open assistant turn. -#}
{{- bos_token -}}
{#- A namespace is used so the system prompt can be extended from inside the tool loop below. -#}
{%- set ns = namespace(system_prompt="") -%}
{#- Lift a leading system message out of the turn list. Checking `messages` first keeps an empty conversation from failing on messages[0]. -#}
{%- if messages and messages[0]["role"] == "system" -%}
    {%- set ns.system_prompt = messages[0]["content"] -%}
    {%- set messages = messages[1:] -%}
{%- endif -%}
{#- Append the tool definitions to the system prompt as a comma-separated list; tools that are not already strings are serialized with tojson. -#}
{%- if tools -%}
    {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
    {%- for tool in tools -%}
        {%- if tool is not string -%}
            {%- set tool = tool | tojson -%}
        {%- endif -%}
        {%- set ns.system_prompt = ns.system_prompt + tool -%}
        {%- if not loop.last -%}
            {%- set ns.system_prompt = ns.system_prompt + ", " -%}
        {%- endif -%}
    {%- endfor -%}
    {%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
{%- endif -%}
{#- The system turn is emitted only when there is a system message or a tool list. -#}
{%- if ns.system_prompt -%}
    {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
{%- endif -%}
{#- One turn per message; non-string content is serialized with tojson, and tool results are wrapped in tool-response markers. -#}
{%- for message in messages -%}
    {{- "<|im_start|>" + message["role"] + "\n" -}}
    {%- set content = message["content"] -%}
    {%- if content is not string -%}
        {%- set content = content | tojson -%}
    {%- endif -%}
    {%- if message["role"] == "tool" -%}
        {%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
    {%- endif -%}
    {{- content + "<|im_end|>\n" -}}
{%- endfor -%}
{#- Open an assistant turn when the caller asks for a generation prompt. -#}
{%- if add_generation_prompt -%}
    {{- "<|im_start|>assistant\n" -}}
{%- endif -%}

View File

@@ -0,0 +1,57 @@
{
"architectures": [
"Lfm2ForCausalLM"
],
"block_auto_adjust_ff_dim": true,
"block_dim": 1024,
"block_ff_dim": 6656,
"block_ffn_dim_multiplier": 1.0,
"block_mlp_init_scale": 1.0,
"block_multiple_of": 256,
"block_norm_eps": 1e-05,
"block_out_init_scale": 1.0,
"block_use_swiglu": true,
"block_use_xavier_init": true,
"bos_token_id": 1,
"conv_L_cache": 3,
"conv_bias": false,
"conv_dim": 1024,
"conv_dim_out": 1024,
"conv_use_xavier_init": true,
"eos_token_id": 7,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 6656,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
],
"max_position_embeddings": 128000,
"model_type": "lfm2",
"norm_eps": 1e-05,
"num_attention_heads": 16,
"num_heads": 16,
"num_hidden_layers": 16,
"num_key_value_heads": 8,
"pad_token_id": 0,
"rope_theta": 1000000.0,
"torch_dtype": "bfloat16",
"transformers_version": "4.54.1",
"use_cache": true,
"use_pos_enc": true,
"vocab_size": 65536
}

View File

@@ -0,0 +1,7 @@
{
"_from_model_config": true,
"bos_token_id": 1,
"eos_token_id": 7,
"pad_token_id": 0,
"transformers_version": "4.54.1"
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
size 708984464

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ff715c5c4b7242e62e8b0f87aa52e4c61b186ff4bcf9bd6ae6f996717938936
size 1418063051

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
size 14645

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84bf56f1dd07d671cce095f4098c57df5ec6e431abdf62fc0db9c16e7ca046e1
size 1465

View File

@@ -0,0 +1,23 @@
{
"bos_token": {
"content": "<|startoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

323812
last-checkpoint/tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,514 @@
{
"best_global_step": 334,
"best_metric": 1.941367506980896,
"best_model_checkpoint": "./Summary-0.1/checkpoint-334",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 501,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.059880239520958084,
"grad_norm": 38.75,
"learning_rate": 4.5e-06,
"loss": 2.7202,
"mean_token_accuracy": 0.45009988248348237,
"num_tokens": 29469.0,
"step": 10
},
{
"epoch": 0.11976047904191617,
"grad_norm": 9.25,
"learning_rate": 9.5e-06,
"loss": 2.2626,
"mean_token_accuracy": 0.5219378590583801,
"num_tokens": 58579.0,
"step": 20
},
{
"epoch": 0.17964071856287425,
"grad_norm": 6.375,
"learning_rate": 1.45e-05,
"loss": 2.2181,
"mean_token_accuracy": 0.518680065870285,
"num_tokens": 88193.0,
"step": 30
},
{
"epoch": 0.23952095808383234,
"grad_norm": 5.9375,
"learning_rate": 1.9500000000000003e-05,
"loss": 2.0667,
"mean_token_accuracy": 0.5430938720703125,
"num_tokens": 116933.0,
"step": 40
},
{
"epoch": 0.2994011976047904,
"grad_norm": 5.65625,
"learning_rate": 2.45e-05,
"loss": 2.1906,
"mean_token_accuracy": 0.5175440430641174,
"num_tokens": 146513.0,
"step": 50
},
{
"epoch": 0.3592814371257485,
"grad_norm": 5.9375,
"learning_rate": 2.95e-05,
"loss": 2.092,
"mean_token_accuracy": 0.5363078862428665,
"num_tokens": 175698.0,
"step": 60
},
{
"epoch": 0.41916167664670656,
"grad_norm": 4.9375,
"learning_rate": 3.45e-05,
"loss": 1.9358,
"mean_token_accuracy": 0.568104338645935,
"num_tokens": 202607.0,
"step": 70
},
{
"epoch": 0.47904191616766467,
"grad_norm": 5.1875,
"learning_rate": 3.9500000000000005e-05,
"loss": 2.0201,
"mean_token_accuracy": 0.550568813085556,
"num_tokens": 231199.0,
"step": 80
},
{
"epoch": 0.5389221556886228,
"grad_norm": 5.03125,
"learning_rate": 4.4500000000000004e-05,
"loss": 1.9076,
"mean_token_accuracy": 0.5696590662002563,
"num_tokens": 260064.0,
"step": 90
},
{
"epoch": 0.5988023952095808,
"grad_norm": 5.03125,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.9292,
"mean_token_accuracy": 0.5686947405338287,
"num_tokens": 288635.0,
"step": 100
},
{
"epoch": 0.6586826347305389,
"grad_norm": 5.40625,
"learning_rate": 4.8076923076923084e-05,
"loss": 2.0754,
"mean_token_accuracy": 0.5396111845970154,
"num_tokens": 315994.0,
"step": 110
},
{
"epoch": 0.718562874251497,
"grad_norm": 4.78125,
"learning_rate": 4.594017094017094e-05,
"loss": 1.942,
"mean_token_accuracy": 0.567721825838089,
"num_tokens": 345852.0,
"step": 120
},
{
"epoch": 0.7784431137724551,
"grad_norm": 4.84375,
"learning_rate": 4.3803418803418805e-05,
"loss": 2.0083,
"mean_token_accuracy": 0.5502024054527282,
"num_tokens": 375046.0,
"step": 130
},
{
"epoch": 0.8383233532934131,
"grad_norm": 4.59375,
"learning_rate": 4.166666666666667e-05,
"loss": 1.9348,
"mean_token_accuracy": 0.563525739312172,
"num_tokens": 403950.0,
"step": 140
},
{
"epoch": 0.8982035928143712,
"grad_norm": 4.90625,
"learning_rate": 3.952991452991453e-05,
"loss": 1.8796,
"mean_token_accuracy": 0.5778123795986175,
"num_tokens": 433871.0,
"step": 150
},
{
"epoch": 0.9580838323353293,
"grad_norm": 4.375,
"learning_rate": 3.739316239316239e-05,
"loss": 2.0021,
"mean_token_accuracy": 0.5490101099014282,
"num_tokens": 463543.0,
"step": 160
},
{
"epoch": 1.0,
"eval_loss": 1.9482988119125366,
"eval_mean_token_accuracy": 0.5604007748457102,
"eval_num_tokens": 482617.0,
"eval_runtime": 39.6692,
"eval_samples_per_second": 2.521,
"eval_steps_per_second": 0.328,
"step": 167
},
{
"epoch": 1.0179640718562875,
"grad_norm": 4.53125,
"learning_rate": 3.525641025641026e-05,
"loss": 1.8939,
"mean_token_accuracy": 0.5751068115234375,
"num_tokens": 491782.0,
"step": 170
},
{
"epoch": 1.0778443113772456,
"grad_norm": 4.25,
"learning_rate": 3.311965811965812e-05,
"loss": 1.572,
"mean_token_accuracy": 0.6416267931461335,
"num_tokens": 521599.0,
"step": 180
},
{
"epoch": 1.1377245508982037,
"grad_norm": 4.71875,
"learning_rate": 3.098290598290599e-05,
"loss": 1.5429,
"mean_token_accuracy": 0.6442601144313812,
"num_tokens": 550968.0,
"step": 190
},
{
"epoch": 1.1976047904191618,
"grad_norm": 4.03125,
"learning_rate": 2.8846153846153845e-05,
"loss": 1.4855,
"mean_token_accuracy": 0.6554409444332123,
"num_tokens": 579850.0,
"step": 200
},
{
"epoch": 1.2574850299401197,
"grad_norm": 4.53125,
"learning_rate": 2.670940170940171e-05,
"loss": 1.5925,
"mean_token_accuracy": 0.6354905068874359,
"num_tokens": 607566.0,
"step": 210
},
{
"epoch": 1.3173652694610778,
"grad_norm": 4.34375,
"learning_rate": 2.4572649572649573e-05,
"loss": 1.6961,
"mean_token_accuracy": 0.611818504333496,
"num_tokens": 636366.0,
"step": 220
},
{
"epoch": 1.377245508982036,
"grad_norm": 4.28125,
"learning_rate": 2.2435897435897437e-05,
"loss": 1.6871,
"mean_token_accuracy": 0.6123433768749237,
"num_tokens": 665501.0,
"step": 230
},
{
"epoch": 1.437125748502994,
"grad_norm": 5.34375,
"learning_rate": 2.02991452991453e-05,
"loss": 1.6756,
"mean_token_accuracy": 0.6171969532966614,
"num_tokens": 692397.0,
"step": 240
},
{
"epoch": 1.4970059880239521,
"grad_norm": 4.40625,
"learning_rate": 1.8162393162393162e-05,
"loss": 1.6237,
"mean_token_accuracy": 0.6245103716850281,
"num_tokens": 720330.0,
"step": 250
},
{
"epoch": 1.55688622754491,
"grad_norm": 3.9375,
"learning_rate": 1.602564102564103e-05,
"loss": 1.6596,
"mean_token_accuracy": 0.6197108209133149,
"num_tokens": 747976.0,
"step": 260
},
{
"epoch": 1.6167664670658684,
"grad_norm": 4.3125,
"learning_rate": 1.388888888888889e-05,
"loss": 1.6428,
"mean_token_accuracy": 0.6236989557743072,
"num_tokens": 776610.0,
"step": 270
},
{
"epoch": 1.6766467065868262,
"grad_norm": 4.125,
"learning_rate": 1.1752136752136752e-05,
"loss": 1.6659,
"mean_token_accuracy": 0.6144291937351227,
"num_tokens": 806986.0,
"step": 280
},
{
"epoch": 1.7365269461077846,
"grad_norm": 4.5625,
"learning_rate": 9.615384615384616e-06,
"loss": 1.6745,
"mean_token_accuracy": 0.6144052445888519,
"num_tokens": 835571.0,
"step": 290
},
{
"epoch": 1.7964071856287425,
"grad_norm": 4.625,
"learning_rate": 7.478632478632479e-06,
"loss": 1.6576,
"mean_token_accuracy": 0.6180627286434174,
"num_tokens": 865294.0,
"step": 300
},
{
"epoch": 1.8562874251497006,
"grad_norm": 4.25,
"learning_rate": 5.341880341880342e-06,
"loss": 1.6627,
"mean_token_accuracy": 0.6169491648674011,
"num_tokens": 894249.0,
"step": 310
},
{
"epoch": 1.9161676646706587,
"grad_norm": 4.96875,
"learning_rate": 3.205128205128205e-06,
"loss": 1.5248,
"mean_token_accuracy": 0.6464354753494262,
"num_tokens": 924046.0,
"step": 320
},
{
"epoch": 1.9760479041916168,
"grad_norm": 4.09375,
"learning_rate": 1.0683760683760685e-06,
"loss": 1.6545,
"mean_token_accuracy": 0.6218094885349273,
"num_tokens": 954354.0,
"step": 330
},
{
"epoch": 2.0,
"eval_loss": 1.941367506980896,
"eval_mean_token_accuracy": 0.564078491467696,
"eval_num_tokens": 965234.0,
"eval_runtime": 39.4675,
"eval_samples_per_second": 2.534,
"eval_steps_per_second": 0.329,
"step": 334
},
{
"epoch": 2.035928143712575,
"grad_norm": 4.375,
"learning_rate": 2.0199501246882794e-05,
"loss": 1.3888,
"mean_token_accuracy": 0.6779806514581045,
"num_tokens": 16794.0,
"step": 340
},
{
"epoch": 2.095808383233533,
"grad_norm": 4.625,
"learning_rate": 1.8952618453865337e-05,
"loss": 1.5284,
"mean_token_accuracy": 0.6481667637825013,
"num_tokens": 44874.0,
"step": 350
},
{
"epoch": 2.155688622754491,
"grad_norm": 5.90625,
"learning_rate": 1.770573566084788e-05,
"loss": 1.5412,
"mean_token_accuracy": 0.6448413729667664,
"num_tokens": 71727.0,
"step": 360
},
{
"epoch": 2.215568862275449,
"grad_norm": 4.3125,
"learning_rate": 1.6458852867830423e-05,
"loss": 1.5195,
"mean_token_accuracy": 0.6477404713630677,
"num_tokens": 101889.0,
"step": 370
},
{
"epoch": 2.2754491017964074,
"grad_norm": 4.0,
"learning_rate": 1.5211970074812968e-05,
"loss": 1.4846,
"mean_token_accuracy": 0.6572466909885406,
"num_tokens": 132140.0,
"step": 380
},
{
"epoch": 2.3353293413173652,
"grad_norm": 4.3125,
"learning_rate": 1.396508728179551e-05,
"loss": 1.604,
"mean_token_accuracy": 0.6328544735908508,
"num_tokens": 161063.0,
"step": 390
},
{
"epoch": 2.3952095808383236,
"grad_norm": 4.71875,
"learning_rate": 1.2718204488778054e-05,
"loss": 1.5462,
"mean_token_accuracy": 0.6424768209457398,
"num_tokens": 189798.0,
"step": 400
},
{
"epoch": 2.4550898203592815,
"grad_norm": 4.34375,
"learning_rate": 1.1471321695760599e-05,
"loss": 1.5468,
"mean_token_accuracy": 0.6386782228946686,
"num_tokens": 219194.0,
"step": 410
},
{
"epoch": 2.5149700598802394,
"grad_norm": 4.65625,
"learning_rate": 1.0224438902743143e-05,
"loss": 1.5713,
"mean_token_accuracy": 0.6371028661727905,
"num_tokens": 249445.0,
"step": 420
},
{
"epoch": 2.5748502994011977,
"grad_norm": 3.875,
"learning_rate": 8.977556109725686e-06,
"loss": 1.4073,
"mean_token_accuracy": 0.6741897523403168,
"num_tokens": 277096.0,
"step": 430
},
{
"epoch": 2.6347305389221556,
"grad_norm": 5.15625,
"learning_rate": 7.73067331670823e-06,
"loss": 1.5873,
"mean_token_accuracy": 0.6345809698104858,
"num_tokens": 306656.0,
"step": 440
},
{
"epoch": 2.694610778443114,
"grad_norm": 4.125,
"learning_rate": 6.483790523690773e-06,
"loss": 1.5398,
"mean_token_accuracy": 0.6446067214012146,
"num_tokens": 334989.0,
"step": 450
},
{
"epoch": 2.754491017964072,
"grad_norm": 4.78125,
"learning_rate": 5.236907730673317e-06,
"loss": 1.4324,
"mean_token_accuracy": 0.6648351371288299,
"num_tokens": 363393.0,
"step": 460
},
{
"epoch": 2.81437125748503,
"grad_norm": 4.375,
"learning_rate": 3.99002493765586e-06,
"loss": 1.4939,
"mean_token_accuracy": 0.6552098572254181,
"num_tokens": 392445.0,
"step": 470
},
{
"epoch": 2.874251497005988,
"grad_norm": 4.625,
"learning_rate": 2.743142144638404e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.6401039361953735,
"num_tokens": 421549.0,
"step": 480
},
{
"epoch": 2.934131736526946,
"grad_norm": 4.9375,
"learning_rate": 1.4962593516209476e-06,
"loss": 1.5018,
"mean_token_accuracy": 0.6498535394668579,
"num_tokens": 450516.0,
"step": 490
},
{
"epoch": 2.9940119760479043,
"grad_norm": 4.8125,
"learning_rate": 2.4937655860349126e-07,
"loss": 1.502,
"mean_token_accuracy": 0.651291674375534,
"num_tokens": 480685.0,
"step": 500
},
{
"epoch": 3.0,
"eval_loss": 1.9642236232757568,
"eval_mean_token_accuracy": 0.5623479668910687,
"eval_num_tokens": 482617.0,
"eval_runtime": 39.682,
"eval_samples_per_second": 2.52,
"eval_steps_per_second": 0.328,
"step": 501
}
],
"logging_steps": 10,
"max_steps": 501,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2647290262044672.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
size 6161

3
model.safetensors Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
size 708984464

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecb847ee90726b4abe0c66ebdd31ab3934722a7e8815cb81d50774a1a4ceb1cd
size 7001

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e703d3d3c34c61650f8af92ebae0b19efa998caec5e19fdce81da57cffb05dd
size 12615

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:be2dd8537a167d5d53fdcc94b5dc73cf39193cd2884a775f0507c8f1e0e52112
size 6029

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98dbb1c20ba50409e061913a84d58f2edbc604b32147f47535808e53fbc9a8d3
size 17977

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:17ab5aa730b700aa9329747cec3f3f2e444669c6922c393bc7b9f7ba2fc388e3
size 6502

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:717930e75e2e44e089ccb908f3dd2ae0efedc6cc94050105ee0c8b9167e13e70
size 12381

23
special_tokens_map.json Normal file
View File

@@ -0,0 +1,23 @@
{
"bos_token": {
"content": "<|startoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<|pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}

323812
tokenizer.json Normal file

File diff suppressed because it is too large Load Diff

4078
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff

3
training_args.bin Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
size 6161