初始化项目，由ModelHub XC社区提供模型

Model: MahmoudIbrahim/Summary-0.1 Source: Original Platform
2026-05-27 15:12:24 +08:00
commit a5967341f6
28 changed files with 656674 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,58 @@
+---
+base_model: MahmoudIbrahim/Summary-0.1
+library_name: transformers
+model_name: Summary-0.1
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for Summary-0.1
+
+[MahmoudIbrahim/Summary-0.1](https://huggingface.co/MahmoudIbrahim/Summary-0.1) is a fine-tuned causal language model built on the LFM2 architecture (`Lfm2ForCausalLM`, see `config.json`).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="MahmoudIbrahim/Summary-0.1", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+ 
+
+
+This model was trained with supervised fine-tuning (SFT).
+
+### Framework versions
+
+- TRL: 0.20.0
+- Transformers: 4.54.1
+- Pytorch: 2.10.0+cu128
+- Datasets: 4.0.0
+- Tokenizers: 0.21.4
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
--- a/chat_template.jinja
+++ b/chat_template.jinja
@@ -0,0 +1,37 @@
+{{- bos_token -}}
+{#- Renders BOS, an optional system turn (system message plus serialized tool list), then one "<|im_start|>role ... <|im_end|>" turn per message. -#}
+{%- set ns = namespace(system_prompt="") -%}{#- namespace attribute, so updates made inside the tool loop below persist -#}
+{%- if messages and messages[0]["role"] == "system" -%}{#- "messages and" keeps an empty conversation from indexing messages[0] -#}
+	{%- set ns.system_prompt = messages[0]["content"] -%}
+	{%- set messages = messages[1:] -%}
+{%- endif -%}
+{%- if tools -%}
+	{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
+	{%- for tool in tools -%}
+		{%- if tool is not string -%}
+			{%- set tool = tool | tojson -%}
+		{%- endif -%}
+		{%- set ns.system_prompt = ns.system_prompt + tool -%}
+		{%- if not loop.last -%}
+			{%- set ns.system_prompt = ns.system_prompt + ", " -%}
+		{%- endif -%}
+	{%- endfor -%}
+	{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
+{%- endif -%}
+{%- if ns.system_prompt -%}
+	{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
+{%- endif -%}
+{%- for message in messages -%}
+	{{- "<|im_start|>" + message["role"] + "\n" -}}
+	{%- set content = message["content"] -%}
+	{%- if content is not string -%}{#- non-string content is serialized to JSON before concatenation -#}
+		{%- set content = content | tojson -%}
+	{%- endif -%}
+	{%- if message["role"] == "tool" -%}
+		{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
+	{%- endif -%}
+	{{- content + "<|im_end|>\n" -}}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+	{{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
--- a/config.json
+++ b/config.json
@@ -0,0 +1,57 @@
+{
+  "architectures": [
+    "Lfm2ForCausalLM"
+  ],
+  "block_auto_adjust_ff_dim": true,
+  "block_dim": 1024,
+  "block_ff_dim": 6656,
+  "block_ffn_dim_multiplier": 1.0,
+  "block_mlp_init_scale": 1.0,
+  "block_multiple_of": 256,
+  "block_norm_eps": 1e-05,
+  "block_out_init_scale": 1.0,
+  "block_use_swiglu": true,
+  "block_use_xavier_init": true,
+  "bos_token_id": 1,
+  "conv_L_cache": 3,
+  "conv_bias": false,
+  "conv_dim": 1024,
+  "conv_dim_out": 1024,
+  "conv_use_xavier_init": true,
+  "eos_token_id": 7,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 6656,
+  "layer_types": [
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv"
+  ],
+  "max_position_embeddings": 128000,
+  "model_type": "lfm2",
+  "norm_eps": 1e-05,
+  "num_attention_heads": 16,
+  "num_heads": 16,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "rope_theta": 1000000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.1",
+  "use_cache": true,
+  "use_pos_enc": true,
+  "vocab_size": 65536
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 7,
+  "pad_token_id": 0,
+  "transformers_version": "4.54.1"
+}
--- a/last-checkpoint/chat_template.jinja
+++ b/last-checkpoint/chat_template.jinja
@@ -0,0 +1,37 @@
+{{- bos_token -}}
+{#- Renders BOS, an optional system turn (system message plus serialized tool list), then one "<|im_start|>role ... <|im_end|>" turn per message. -#}
+{%- set ns = namespace(system_prompt="") -%}{#- namespace attribute, so updates made inside the tool loop below persist -#}
+{%- if messages and messages[0]["role"] == "system" -%}{#- "messages and" keeps an empty conversation from indexing messages[0] -#}
+	{%- set ns.system_prompt = messages[0]["content"] -%}
+	{%- set messages = messages[1:] -%}
+{%- endif -%}
+{%- if tools -%}
+	{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: <|tool_list_start|>[" -%}
+	{%- for tool in tools -%}
+		{%- if tool is not string -%}
+			{%- set tool = tool | tojson -%}
+		{%- endif -%}
+		{%- set ns.system_prompt = ns.system_prompt + tool -%}
+		{%- if not loop.last -%}
+			{%- set ns.system_prompt = ns.system_prompt + ", " -%}
+		{%- endif -%}
+	{%- endfor -%}
+	{%- set ns.system_prompt = ns.system_prompt + "]<|tool_list_end|>" -%}
+{%- endif -%}
+{%- if ns.system_prompt -%}
+	{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
+{%- endif -%}
+{%- for message in messages -%}
+	{{- "<|im_start|>" + message["role"] + "\n" -}}
+	{%- set content = message["content"] -%}
+	{%- if content is not string -%}{#- non-string content is serialized to JSON before concatenation -#}
+		{%- set content = content | tojson -%}
+	{%- endif -%}
+	{%- if message["role"] == "tool" -%}
+		{%- set content = "<|tool_response_start|>" + content + "<|tool_response_end|>" -%}
+	{%- endif -%}
+	{{- content + "<|im_end|>\n" -}}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+	{{- "<|im_start|>assistant\n" -}}
+{%- endif -%}
--- a/last-checkpoint/config.json
+++ b/last-checkpoint/config.json
@@ -0,0 +1,57 @@
+{
+  "architectures": [
+    "Lfm2ForCausalLM"
+  ],
+  "block_auto_adjust_ff_dim": true,
+  "block_dim": 1024,
+  "block_ff_dim": 6656,
+  "block_ffn_dim_multiplier": 1.0,
+  "block_mlp_init_scale": 1.0,
+  "block_multiple_of": 256,
+  "block_norm_eps": 1e-05,
+  "block_out_init_scale": 1.0,
+  "block_use_swiglu": true,
+  "block_use_xavier_init": true,
+  "bos_token_id": 1,
+  "conv_L_cache": 3,
+  "conv_bias": false,
+  "conv_dim": 1024,
+  "conv_dim_out": 1024,
+  "conv_use_xavier_init": true,
+  "eos_token_id": 7,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 6656,
+  "layer_types": [
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv",
+    "full_attention",
+    "conv"
+  ],
+  "max_position_embeddings": 128000,
+  "model_type": "lfm2",
+  "norm_eps": 1e-05,
+  "num_attention_heads": 16,
+  "num_heads": 16,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 8,
+  "pad_token_id": 0,
+  "rope_theta": 1000000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.1",
+  "use_cache": true,
+  "use_pos_enc": true,
+  "vocab_size": 65536
+}
--- a/last-checkpoint/generation_config.json
+++ b/last-checkpoint/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 7,
+  "pad_token_id": 0,
+  "transformers_version": "4.54.1"
+}
--- a/last-checkpoint/model.safetensors
+++ b/last-checkpoint/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
+size 708984464
--- a/last-checkpoint/optimizer.pt
+++ b/last-checkpoint/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ff715c5c4b7242e62e8b0f87aa52e4c61b186ff4bcf9bd6ae6f996717938936
+size 1418063051
--- a/last-checkpoint/rng_state.pth
+++ b/last-checkpoint/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
+size 14645
--- a/last-checkpoint/scheduler.pt
+++ b/last-checkpoint/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84bf56f1dd07d671cce095f4098c57df5ec6e431abdf62fc0db9c16e7ca046e1
+size 1465
--- a/last-checkpoint/special_tokens_map.json
+++ b/last-checkpoint/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/last-checkpoint/tokenizer.json
+++ b/last-checkpoint/tokenizer.json
--- a/last-checkpoint/tokenizer_config.json
+++ b/last-checkpoint/tokenizer_config.json
--- a/last-checkpoint/trainer_state.json
+++ b/last-checkpoint/trainer_state.json
@@ -0,0 +1,514 @@
+{
+  "best_global_step": 334,
+  "best_metric": 1.941367506980896,
+  "best_model_checkpoint": "./Summary-0.1/checkpoint-334",
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 501,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.059880239520958084,
+      "grad_norm": 38.75,
+      "learning_rate": 4.5e-06,
+      "loss": 2.7202,
+      "mean_token_accuracy": 0.45009988248348237,
+      "num_tokens": 29469.0,
+      "step": 10
+    },
+    {
+      "epoch": 0.11976047904191617,
+      "grad_norm": 9.25,
+      "learning_rate": 9.5e-06,
+      "loss": 2.2626,
+      "mean_token_accuracy": 0.5219378590583801,
+      "num_tokens": 58579.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.17964071856287425,
+      "grad_norm": 6.375,
+      "learning_rate": 1.45e-05,
+      "loss": 2.2181,
+      "mean_token_accuracy": 0.518680065870285,
+      "num_tokens": 88193.0,
+      "step": 30
+    },
+    {
+      "epoch": 0.23952095808383234,
+      "grad_norm": 5.9375,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 2.0667,
+      "mean_token_accuracy": 0.5430938720703125,
+      "num_tokens": 116933.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.2994011976047904,
+      "grad_norm": 5.65625,
+      "learning_rate": 2.45e-05,
+      "loss": 2.1906,
+      "mean_token_accuracy": 0.5175440430641174,
+      "num_tokens": 146513.0,
+      "step": 50
+    },
+    {
+      "epoch": 0.3592814371257485,
+      "grad_norm": 5.9375,
+      "learning_rate": 2.95e-05,
+      "loss": 2.092,
+      "mean_token_accuracy": 0.5363078862428665,
+      "num_tokens": 175698.0,
+      "step": 60
+    },
+    {
+      "epoch": 0.41916167664670656,
+      "grad_norm": 4.9375,
+      "learning_rate": 3.45e-05,
+      "loss": 1.9358,
+      "mean_token_accuracy": 0.568104338645935,
+      "num_tokens": 202607.0,
+      "step": 70
+    },
+    {
+      "epoch": 0.47904191616766467,
+      "grad_norm": 5.1875,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 2.0201,
+      "mean_token_accuracy": 0.550568813085556,
+      "num_tokens": 231199.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.5389221556886228,
+      "grad_norm": 5.03125,
+      "learning_rate": 4.4500000000000004e-05,
+      "loss": 1.9076,
+      "mean_token_accuracy": 0.5696590662002563,
+      "num_tokens": 260064.0,
+      "step": 90
+    },
+    {
+      "epoch": 0.5988023952095808,
+      "grad_norm": 5.03125,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.9292,
+      "mean_token_accuracy": 0.5686947405338287,
+      "num_tokens": 288635.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.6586826347305389,
+      "grad_norm": 5.40625,
+      "learning_rate": 4.8076923076923084e-05,
+      "loss": 2.0754,
+      "mean_token_accuracy": 0.5396111845970154,
+      "num_tokens": 315994.0,
+      "step": 110
+    },
+    {
+      "epoch": 0.718562874251497,
+      "grad_norm": 4.78125,
+      "learning_rate": 4.594017094017094e-05,
+      "loss": 1.942,
+      "mean_token_accuracy": 0.567721825838089,
+      "num_tokens": 345852.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.7784431137724551,
+      "grad_norm": 4.84375,
+      "learning_rate": 4.3803418803418805e-05,
+      "loss": 2.0083,
+      "mean_token_accuracy": 0.5502024054527282,
+      "num_tokens": 375046.0,
+      "step": 130
+    },
+    {
+      "epoch": 0.8383233532934131,
+      "grad_norm": 4.59375,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.9348,
+      "mean_token_accuracy": 0.563525739312172,
+      "num_tokens": 403950.0,
+      "step": 140
+    },
+    {
+      "epoch": 0.8982035928143712,
+      "grad_norm": 4.90625,
+      "learning_rate": 3.952991452991453e-05,
+      "loss": 1.8796,
+      "mean_token_accuracy": 0.5778123795986175,
+      "num_tokens": 433871.0,
+      "step": 150
+    },
+    {
+      "epoch": 0.9580838323353293,
+      "grad_norm": 4.375,
+      "learning_rate": 3.739316239316239e-05,
+      "loss": 2.0021,
+      "mean_token_accuracy": 0.5490101099014282,
+      "num_tokens": 463543.0,
+      "step": 160
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.9482988119125366,
+      "eval_mean_token_accuracy": 0.5604007748457102,
+      "eval_num_tokens": 482617.0,
+      "eval_runtime": 39.6692,
+      "eval_samples_per_second": 2.521,
+      "eval_steps_per_second": 0.328,
+      "step": 167
+    },
+    {
+      "epoch": 1.0179640718562875,
+      "grad_norm": 4.53125,
+      "learning_rate": 3.525641025641026e-05,
+      "loss": 1.8939,
+      "mean_token_accuracy": 0.5751068115234375,
+      "num_tokens": 491782.0,
+      "step": 170
+    },
+    {
+      "epoch": 1.0778443113772456,
+      "grad_norm": 4.25,
+      "learning_rate": 3.311965811965812e-05,
+      "loss": 1.572,
+      "mean_token_accuracy": 0.6416267931461335,
+      "num_tokens": 521599.0,
+      "step": 180
+    },
+    {
+      "epoch": 1.1377245508982037,
+      "grad_norm": 4.71875,
+      "learning_rate": 3.098290598290599e-05,
+      "loss": 1.5429,
+      "mean_token_accuracy": 0.6442601144313812,
+      "num_tokens": 550968.0,
+      "step": 190
+    },
+    {
+      "epoch": 1.1976047904191618,
+      "grad_norm": 4.03125,
+      "learning_rate": 2.8846153846153845e-05,
+      "loss": 1.4855,
+      "mean_token_accuracy": 0.6554409444332123,
+      "num_tokens": 579850.0,
+      "step": 200
+    },
+    {
+      "epoch": 1.2574850299401197,
+      "grad_norm": 4.53125,
+      "learning_rate": 2.670940170940171e-05,
+      "loss": 1.5925,
+      "mean_token_accuracy": 0.6354905068874359,
+      "num_tokens": 607566.0,
+      "step": 210
+    },
+    {
+      "epoch": 1.3173652694610778,
+      "grad_norm": 4.34375,
+      "learning_rate": 2.4572649572649573e-05,
+      "loss": 1.6961,
+      "mean_token_accuracy": 0.611818504333496,
+      "num_tokens": 636366.0,
+      "step": 220
+    },
+    {
+      "epoch": 1.377245508982036,
+      "grad_norm": 4.28125,
+      "learning_rate": 2.2435897435897437e-05,
+      "loss": 1.6871,
+      "mean_token_accuracy": 0.6123433768749237,
+      "num_tokens": 665501.0,
+      "step": 230
+    },
+    {
+      "epoch": 1.437125748502994,
+      "grad_norm": 5.34375,
+      "learning_rate": 2.02991452991453e-05,
+      "loss": 1.6756,
+      "mean_token_accuracy": 0.6171969532966614,
+      "num_tokens": 692397.0,
+      "step": 240
+    },
+    {
+      "epoch": 1.4970059880239521,
+      "grad_norm": 4.40625,
+      "learning_rate": 1.8162393162393162e-05,
+      "loss": 1.6237,
+      "mean_token_accuracy": 0.6245103716850281,
+      "num_tokens": 720330.0,
+      "step": 250
+    },
+    {
+      "epoch": 1.55688622754491,
+      "grad_norm": 3.9375,
+      "learning_rate": 1.602564102564103e-05,
+      "loss": 1.6596,
+      "mean_token_accuracy": 0.6197108209133149,
+      "num_tokens": 747976.0,
+      "step": 260
+    },
+    {
+      "epoch": 1.6167664670658684,
+      "grad_norm": 4.3125,
+      "learning_rate": 1.388888888888889e-05,
+      "loss": 1.6428,
+      "mean_token_accuracy": 0.6236989557743072,
+      "num_tokens": 776610.0,
+      "step": 270
+    },
+    {
+      "epoch": 1.6766467065868262,
+      "grad_norm": 4.125,
+      "learning_rate": 1.1752136752136752e-05,
+      "loss": 1.6659,
+      "mean_token_accuracy": 0.6144291937351227,
+      "num_tokens": 806986.0,
+      "step": 280
+    },
+    {
+      "epoch": 1.7365269461077846,
+      "grad_norm": 4.5625,
+      "learning_rate": 9.615384615384616e-06,
+      "loss": 1.6745,
+      "mean_token_accuracy": 0.6144052445888519,
+      "num_tokens": 835571.0,
+      "step": 290
+    },
+    {
+      "epoch": 1.7964071856287425,
+      "grad_norm": 4.625,
+      "learning_rate": 7.478632478632479e-06,
+      "loss": 1.6576,
+      "mean_token_accuracy": 0.6180627286434174,
+      "num_tokens": 865294.0,
+      "step": 300
+    },
+    {
+      "epoch": 1.8562874251497006,
+      "grad_norm": 4.25,
+      "learning_rate": 5.341880341880342e-06,
+      "loss": 1.6627,
+      "mean_token_accuracy": 0.6169491648674011,
+      "num_tokens": 894249.0,
+      "step": 310
+    },
+    {
+      "epoch": 1.9161676646706587,
+      "grad_norm": 4.96875,
+      "learning_rate": 3.205128205128205e-06,
+      "loss": 1.5248,
+      "mean_token_accuracy": 0.6464354753494262,
+      "num_tokens": 924046.0,
+      "step": 320
+    },
+    {
+      "epoch": 1.9760479041916168,
+      "grad_norm": 4.09375,
+      "learning_rate": 1.0683760683760685e-06,
+      "loss": 1.6545,
+      "mean_token_accuracy": 0.6218094885349273,
+      "num_tokens": 954354.0,
+      "step": 330
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.941367506980896,
+      "eval_mean_token_accuracy": 0.564078491467696,
+      "eval_num_tokens": 965234.0,
+      "eval_runtime": 39.4675,
+      "eval_samples_per_second": 2.534,
+      "eval_steps_per_second": 0.329,
+      "step": 334
+    },
+    {
+      "epoch": 2.035928143712575,
+      "grad_norm": 4.375,
+      "learning_rate": 2.0199501246882794e-05,
+      "loss": 1.3888,
+      "mean_token_accuracy": 0.6779806514581045,
+      "num_tokens": 16794.0,
+      "step": 340
+    },
+    {
+      "epoch": 2.095808383233533,
+      "grad_norm": 4.625,
+      "learning_rate": 1.8952618453865337e-05,
+      "loss": 1.5284,
+      "mean_token_accuracy": 0.6481667637825013,
+      "num_tokens": 44874.0,
+      "step": 350
+    },
+    {
+      "epoch": 2.155688622754491,
+      "grad_norm": 5.90625,
+      "learning_rate": 1.770573566084788e-05,
+      "loss": 1.5412,
+      "mean_token_accuracy": 0.6448413729667664,
+      "num_tokens": 71727.0,
+      "step": 360
+    },
+    {
+      "epoch": 2.215568862275449,
+      "grad_norm": 4.3125,
+      "learning_rate": 1.6458852867830423e-05,
+      "loss": 1.5195,
+      "mean_token_accuracy": 0.6477404713630677,
+      "num_tokens": 101889.0,
+      "step": 370
+    },
+    {
+      "epoch": 2.2754491017964074,
+      "grad_norm": 4.0,
+      "learning_rate": 1.5211970074812968e-05,
+      "loss": 1.4846,
+      "mean_token_accuracy": 0.6572466909885406,
+      "num_tokens": 132140.0,
+      "step": 380
+    },
+    {
+      "epoch": 2.3353293413173652,
+      "grad_norm": 4.3125,
+      "learning_rate": 1.396508728179551e-05,
+      "loss": 1.604,
+      "mean_token_accuracy": 0.6328544735908508,
+      "num_tokens": 161063.0,
+      "step": 390
+    },
+    {
+      "epoch": 2.3952095808383236,
+      "grad_norm": 4.71875,
+      "learning_rate": 1.2718204488778054e-05,
+      "loss": 1.5462,
+      "mean_token_accuracy": 0.6424768209457398,
+      "num_tokens": 189798.0,
+      "step": 400
+    },
+    {
+      "epoch": 2.4550898203592815,
+      "grad_norm": 4.34375,
+      "learning_rate": 1.1471321695760599e-05,
+      "loss": 1.5468,
+      "mean_token_accuracy": 0.6386782228946686,
+      "num_tokens": 219194.0,
+      "step": 410
+    },
+    {
+      "epoch": 2.5149700598802394,
+      "grad_norm": 4.65625,
+      "learning_rate": 1.0224438902743143e-05,
+      "loss": 1.5713,
+      "mean_token_accuracy": 0.6371028661727905,
+      "num_tokens": 249445.0,
+      "step": 420
+    },
+    {
+      "epoch": 2.5748502994011977,
+      "grad_norm": 3.875,
+      "learning_rate": 8.977556109725686e-06,
+      "loss": 1.4073,
+      "mean_token_accuracy": 0.6741897523403168,
+      "num_tokens": 277096.0,
+      "step": 430
+    },
+    {
+      "epoch": 2.6347305389221556,
+      "grad_norm": 5.15625,
+      "learning_rate": 7.73067331670823e-06,
+      "loss": 1.5873,
+      "mean_token_accuracy": 0.6345809698104858,
+      "num_tokens": 306656.0,
+      "step": 440
+    },
+    {
+      "epoch": 2.694610778443114,
+      "grad_norm": 4.125,
+      "learning_rate": 6.483790523690773e-06,
+      "loss": 1.5398,
+      "mean_token_accuracy": 0.6446067214012146,
+      "num_tokens": 334989.0,
+      "step": 450
+    },
+    {
+      "epoch": 2.754491017964072,
+      "grad_norm": 4.78125,
+      "learning_rate": 5.236907730673317e-06,
+      "loss": 1.4324,
+      "mean_token_accuracy": 0.6648351371288299,
+      "num_tokens": 363393.0,
+      "step": 460
+    },
+    {
+      "epoch": 2.81437125748503,
+      "grad_norm": 4.375,
+      "learning_rate": 3.99002493765586e-06,
+      "loss": 1.4939,
+      "mean_token_accuracy": 0.6552098572254181,
+      "num_tokens": 392445.0,
+      "step": 470
+    },
+    {
+      "epoch": 2.874251497005988,
+      "grad_norm": 4.625,
+      "learning_rate": 2.743142144638404e-06,
+      "loss": 1.5451,
+      "mean_token_accuracy": 0.6401039361953735,
+      "num_tokens": 421549.0,
+      "step": 480
+    },
+    {
+      "epoch": 2.934131736526946,
+      "grad_norm": 4.9375,
+      "learning_rate": 1.4962593516209476e-06,
+      "loss": 1.5018,
+      "mean_token_accuracy": 0.6498535394668579,
+      "num_tokens": 450516.0,
+      "step": 490
+    },
+    {
+      "epoch": 2.9940119760479043,
+      "grad_norm": 4.8125,
+      "learning_rate": 2.4937655860349126e-07,
+      "loss": 1.502,
+      "mean_token_accuracy": 0.651291674375534,
+      "num_tokens": 480685.0,
+      "step": 500
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.9642236232757568,
+      "eval_mean_token_accuracy": 0.5623479668910687,
+      "eval_num_tokens": 482617.0,
+      "eval_runtime": 39.682,
+      "eval_samples_per_second": 2.52,
+      "eval_steps_per_second": 0.328,
+      "step": 501
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 501,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2647290262044672.0,
+  "train_batch_size": 3,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/last-checkpoint/training_args.bin
+++ b/last-checkpoint/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
+size 6161
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a72a583d2100bebe67877b3dddd468f2f42f288d51ae5f92982bccf3f89299a3
+size 708984464
--- a/runs/Apr11_09-00-16_d83477ed97e7/events.out.tfevents.1775898020.d83477ed97e7.5441.3
+++ b/runs/Apr11_09-00-16_d83477ed97e7/events.out.tfevents.1775898020.d83477ed97e7.5441.3
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecb847ee90726b4abe0c66ebdd31ab3934722a7e8815cb81d50774a1a4ceb1cd
+size 7001
--- a/runs/Apr11_09-02-18_d83477ed97e7/events.out.tfevents.1775898139.d83477ed97e7.5441.4
+++ b/runs/Apr11_09-02-18_d83477ed97e7/events.out.tfevents.1775898139.d83477ed97e7.5441.4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e703d3d3c34c61650f8af92ebae0b19efa998caec5e19fdce81da57cffb05dd
+size 12615
--- a/runs/Apr11_09-17-46_d83477ed97e7/events.out.tfevents.1775899068.d83477ed97e7.24672.0
+++ b/runs/Apr11_09-17-46_d83477ed97e7/events.out.tfevents.1775899068.d83477ed97e7.24672.0
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be2dd8537a167d5d53fdcc94b5dc73cf39193cd2884a775f0507c8f1e0e52112
+size 6029
--- a/runs/Apr11_09-19-29_d83477ed97e7/events.out.tfevents.1775899171.d83477ed97e7.24672.1
+++ b/runs/Apr11_09-19-29_d83477ed97e7/events.out.tfevents.1775899171.d83477ed97e7.24672.1
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98dbb1c20ba50409e061913a84d58f2edbc604b32147f47535808e53fbc9a8d3
+size 17977
--- a/runs/Apr11_09-19-29_d83477ed97e7/events.out.tfevents.1775900673.d83477ed97e7.24672.2
+++ b/runs/Apr11_09-19-29_d83477ed97e7/events.out.tfevents.1775900673.d83477ed97e7.24672.2
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17ab5aa730b700aa9329747cec3f3f2e444669c6922c393bc7b9f7ba2fc388e3
+size 6502
--- a/runs/Apr11_09-45-39_d83477ed97e7/events.out.tfevents.1775900754.d83477ed97e7.24672.3
+++ b/runs/Apr11_09-45-39_d83477ed97e7/events.out.tfevents.1775900754.d83477ed97e7.24672.3
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:717930e75e2e44e089ccb908f3dd2ae0efedc6cc94050105ee0c8b9167e13e70
+size 12381
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
@@ -0,0 +1,23 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83704ebef2b25c4db48557507116db40f180c5ce7b0b3cd17a50c1b82dd55055
+size 6161