初始化项目，由ModelHub XC社区提供模型

Model: fpadovani/tur_indomain_prepretraining_seed443 Source: Original Platform
2026-05-30 04:36:22 +08:00
commit d774fe1195
97 changed files with 2033223 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
--- a/README.md
+++ b/README.md
@@ -0,0 +1,58 @@
+---
+base_model: goldfish-models/tur_latn_10mb
+library_name: transformers
+model_name: tur_indomain_prepretraining_seed443
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+
+# Model Card for tur_indomain_prepretraining_seed443
+
+This model is a fine-tuned version of [goldfish-models/tur_latn_10mb](https://huggingface.co/goldfish-models/tur_latn_10mb).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+
+## Quick start
+
+```python
+from transformers import pipeline
+
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="fpadovani/tur_indomain_prepretraining_seed443", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+
+## Training procedure
+
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/f-padovani-university-of-groningen/formal_lang_tur/runs/pp9s9vhr) 
+
+
+This model was trained with supervised fine-tuning (SFT).
+
+### Framework versions
+
+- TRL: 0.13.0
+- Transformers: 4.47.0
+- Pytorch: 2.11.0
+- Datasets: 4.8.5
+- Tokenizers: 0.21.0
+
+## Citations
+
+
+
+Cite TRL as:
+    
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```
--- a/checkpoint-1000/config.json
+++ b/checkpoint-1000/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-1000/generation_config.json
+++ b/checkpoint-1000/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-1000/model.safetensors
+++ b/checkpoint-1000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d759f8edca2fd397c7c341622f6d6db65b259d40115be566357acccfe1180b7
+size 79752272
--- a/checkpoint-1000/optimizer.pt
+++ b/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57a47dcc8fd9802298e5ef35edc12889b807e4de4ce512e5dc89b69630ea8706
+size 159538443
--- a/checkpoint-1000/rng_state.pth
+++ b/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b6bdea08427f6f7e9655c7ffce053c58896d7e3be7bc5ff758191983c3fab18
+size 14645
--- a/checkpoint-1000/scheduler.pt
+++ b/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49b5d602a36644e60adc641ff93b54a00173a84964edb693fdb6600fe5162714
+size 1465
--- a/checkpoint-1000/special_tokens_map.json
+++ b/checkpoint-1000/special_tokens_map.json
--- a/checkpoint-1000/tokenizer.json
+++ b/checkpoint-1000/tokenizer.json
--- a/checkpoint-1000/tokenizer_config.json
+++ b/checkpoint-1000/tokenizer_config.json
--- a/checkpoint-1000/trainer_state.json
+++ b/checkpoint-1000/trainer_state.json
--- a/checkpoint-1000/training_args.bin
+++ b/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-1500/config.json
+++ b/checkpoint-1500/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-1500/generation_config.json
+++ b/checkpoint-1500/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-1500/model.safetensors
+++ b/checkpoint-1500/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc62649876a24d8523dea8a5a4f38e38fb265d13fedb0e000c4d93e8e7f3401
+size 79752272
--- a/checkpoint-1500/optimizer.pt
+++ b/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10a3e995bc0a215d0fc53c5364682b1aee2d53bc4e1073e48a62926d52f64c71
+size 159538443
--- a/checkpoint-1500/rng_state.pth
+++ b/checkpoint-1500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:765b0d7858091d7ba87c8b5b857d0c75a24b9ba3e1217688224015760db749aa
+size 14645
--- a/checkpoint-1500/scheduler.pt
+++ b/checkpoint-1500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdf706bbb7bb6e14789b8a64f9da210a0ebd06388777b0fcde0c23247b8fa573
+size 1465
--- a/checkpoint-1500/special_tokens_map.json
+++ b/checkpoint-1500/special_tokens_map.json
--- a/checkpoint-1500/tokenizer.json
+++ b/checkpoint-1500/tokenizer.json
--- a/checkpoint-1500/tokenizer_config.json
+++ b/checkpoint-1500/tokenizer_config.json
--- a/checkpoint-1500/trainer_state.json
+++ b/checkpoint-1500/trainer_state.json
--- a/checkpoint-1500/training_args.bin
+++ b/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-2000/config.json
+++ b/checkpoint-2000/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-2000/generation_config.json
+++ b/checkpoint-2000/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-2000/model.safetensors
+++ b/checkpoint-2000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:943b0241960b17a4a3ecb6c5126c5d109bcf763a10e573225439c642b42e0066
+size 79752272
--- a/checkpoint-2000/optimizer.pt
+++ b/checkpoint-2000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e79586aba1661447ac3756eac1e5bfd1dfdf54c9ba9400036b70565ef4814c33
+size 159538443
--- a/checkpoint-2000/rng_state.pth
+++ b/checkpoint-2000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cfcd296e9abdf8a7b5a10397cfc9267944fedf70c8733ce64ad88e9daccdbe8
+size 14645
--- a/checkpoint-2000/scheduler.pt
+++ b/checkpoint-2000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:012c90d7670b2bb998827d13327927d9a7c2889b9259d1469026aac83c30468b
+size 1465
--- a/checkpoint-2000/special_tokens_map.json
+++ b/checkpoint-2000/special_tokens_map.json
--- a/checkpoint-2000/tokenizer.json
+++ b/checkpoint-2000/tokenizer.json
--- a/checkpoint-2000/tokenizer_config.json
+++ b/checkpoint-2000/tokenizer_config.json
--- a/checkpoint-2000/trainer_state.json
+++ b/checkpoint-2000/trainer_state.json
--- a/checkpoint-2000/training_args.bin
+++ b/checkpoint-2000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-2500/config.json
+++ b/checkpoint-2500/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-2500/generation_config.json
+++ b/checkpoint-2500/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-2500/model.safetensors
+++ b/checkpoint-2500/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e660ea3b9087bf515ef59a7502e491c262827c37fb79b8108da0830f71397df5
+size 79752272
--- a/checkpoint-2500/optimizer.pt
+++ b/checkpoint-2500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1a42c5a9493833024fd51018a8a49af1f2cea0e9645dd68b0cf0fe2331de2b13
+size 159538443
--- a/checkpoint-2500/rng_state.pth
+++ b/checkpoint-2500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:84eb9f55b9e425120c0f0c6f7e2475a0f5d883dcf62ac5b50b34d84ca2d8482f
+size 14645
--- a/checkpoint-2500/scheduler.pt
+++ b/checkpoint-2500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6b3ad5549d5099e5cee60aa161fa531e77bd60d13a0dc32433f3e6e01a85319
+size 1465
--- a/checkpoint-2500/special_tokens_map.json
+++ b/checkpoint-2500/special_tokens_map.json
--- a/checkpoint-2500/tokenizer.json
+++ b/checkpoint-2500/tokenizer.json
--- a/checkpoint-2500/tokenizer_config.json
+++ b/checkpoint-2500/tokenizer_config.json
--- a/checkpoint-2500/trainer_state.json
+++ b/checkpoint-2500/trainer_state.json
--- a/checkpoint-2500/training_args.bin
+++ b/checkpoint-2500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-3000/config.json
+++ b/checkpoint-3000/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-3000/generation_config.json
+++ b/checkpoint-3000/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-3000/model.safetensors
+++ b/checkpoint-3000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:20b44ad82ef3b8a0e0eb3e3f8d1f7793287ea46d8bba12fcf95b8ca6c1ffd594
+size 79752272
--- a/checkpoint-3000/optimizer.pt
+++ b/checkpoint-3000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff8da7216b0e6e865fe569f2bce465ece4022e3b28adf9b00e35191b5578f02b
+size 159538443
--- a/checkpoint-3000/rng_state.pth
+++ b/checkpoint-3000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87a90a678107058f0f57bb547f609405f9273aaec379df390433f706635743c6
+size 14645
--- a/checkpoint-3000/scheduler.pt
+++ b/checkpoint-3000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dc70e9e1c1fef46d69e152f6d8d91fd0c8ce65b2fe905d341de3ee92357d097
+size 1465
--- a/checkpoint-3000/special_tokens_map.json
+++ b/checkpoint-3000/special_tokens_map.json
--- a/checkpoint-3000/tokenizer.json
+++ b/checkpoint-3000/tokenizer.json
--- a/checkpoint-3000/tokenizer_config.json
+++ b/checkpoint-3000/tokenizer_config.json
--- a/checkpoint-3000/trainer_state.json
+++ b/checkpoint-3000/trainer_state.json
--- a/checkpoint-3000/training_args.bin
+++ b/checkpoint-3000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-3500/config.json
+++ b/checkpoint-3500/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-3500/generation_config.json
+++ b/checkpoint-3500/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-3500/model.safetensors
+++ b/checkpoint-3500/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec7a678bfbea3b3042b54084c849c86842e6847b27ddb7f1a3773bc774a63467
+size 79752272
--- a/checkpoint-3500/optimizer.pt
+++ b/checkpoint-3500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c21529a9cd0af04b4bdfb1e5e1366ed6725cd78baeed855b556093969fa5e26e
+size 159538443
--- a/checkpoint-3500/rng_state.pth
+++ b/checkpoint-3500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64e6b0bcc54eddddbb87b095bde4156a7e7612af5bfc9eeafffa9273fe348166
+size 14645
--- a/checkpoint-3500/scheduler.pt
+++ b/checkpoint-3500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f6d9a470b97058ccd4f7c214a2a15352732f3be5686d8ebcb2c85f286ab0b593
+size 1465
--- a/checkpoint-3500/special_tokens_map.json
+++ b/checkpoint-3500/special_tokens_map.json
--- a/checkpoint-3500/tokenizer.json
+++ b/checkpoint-3500/tokenizer.json
--- a/checkpoint-3500/tokenizer_config.json
+++ b/checkpoint-3500/tokenizer_config.json
--- a/checkpoint-3500/trainer_state.json
+++ b/checkpoint-3500/trainer_state.json
--- a/checkpoint-3500/training_args.bin
+++ b/checkpoint-3500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-4000/config.json
+++ b/checkpoint-4000/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-4000/generation_config.json
+++ b/checkpoint-4000/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-4000/model.safetensors
+++ b/checkpoint-4000/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b5dd841979da98c7ad0f596821d61d9746c9c2ecc8dcfb868422d9bcb393f1
+size 79752272
--- a/checkpoint-4000/optimizer.pt
+++ b/checkpoint-4000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07626b6f7e42c0abafdcda599f2d64e3497f51c63ba203569db1a303dfc4a0bf
+size 159538443
--- a/checkpoint-4000/rng_state.pth
+++ b/checkpoint-4000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1aa1a59eb1c529fd3de41a5fad25b240e8ac03225c5e6ea6ffd55cb52e647fe
+size 14645
--- a/checkpoint-4000/scheduler.pt
+++ b/checkpoint-4000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a59cc45fece372941e082c6cabb8944243288549677e1e376f4e0b1009f0e7ad
+size 1465
--- a/checkpoint-4000/special_tokens_map.json
+++ b/checkpoint-4000/special_tokens_map.json
--- a/checkpoint-4000/tokenizer.json
+++ b/checkpoint-4000/tokenizer.json
--- a/checkpoint-4000/tokenizer_config.json
+++ b/checkpoint-4000/tokenizer_config.json
--- a/checkpoint-4000/trainer_state.json
+++ b/checkpoint-4000/trainer_state.json
--- a/checkpoint-4000/training_args.bin
+++ b/checkpoint-4000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/checkpoint-500/config.json
+++ b/checkpoint-500/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/checkpoint-500/generation_config.json
+++ b/checkpoint-500/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/checkpoint-500/model.safetensors
+++ b/checkpoint-500/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:988b217926880491312a2f24c1950372b3beab48753200a5fa6ed38e1997da72
+size 79752272
--- a/checkpoint-500/optimizer.pt
+++ b/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8b5460df615fa54ce3f9cb331dddcd6331eb66cb74d0c18026ed0fcba91f1ec
+size 159538443
--- a/checkpoint-500/rng_state.pth
+++ b/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b65c0962ff038a38b6e94677564291e50dd476018832f6add34bd1e751ea3a86
+size 14645
--- a/checkpoint-500/scheduler.pt
+++ b/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f243d020d2631ea115c3f5d3e6c1e5c9a7f8334b45c5f10056444d8c3da615c
+size 1465
--- a/checkpoint-500/special_tokens_map.json
+++ b/checkpoint-500/special_tokens_map.json
--- a/checkpoint-500/tokenizer.json
+++ b/checkpoint-500/tokenizer.json
--- a/checkpoint-500/tokenizer_config.json
+++ b/checkpoint-500/tokenizer_config.json
--- a/checkpoint-500/trainer_state.json
+++ b/checkpoint-500/trainer_state.json
@@ -0,0 +1,733 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.0653893938403191,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000653893938403191,
+      "grad_norm": 8.0,
+      "learning_rate": 1e-05,
+      "loss": 10.9235,
+      "step": 5
+    },
+    {
+      "epoch": 0.001307787876806382,
+      "grad_norm": 7.3125,
+      "learning_rate": 2e-05,
+      "loss": 10.8446,
+      "step": 10
+    },
+    {
+      "epoch": 0.001961681815209573,
+      "grad_norm": 5.1875,
+      "learning_rate": 3e-05,
+      "loss": 10.6184,
+      "step": 15
+    },
+    {
+      "epoch": 0.002615575753612764,
+      "grad_norm": 3.453125,
+      "learning_rate": 4e-05,
+      "loss": 10.4354,
+      "step": 20
+    },
+    {
+      "epoch": 0.003269469692015955,
+      "grad_norm": 3.359375,
+      "learning_rate": 5e-05,
+      "loss": 10.2832,
+      "step": 25
+    },
+    {
+      "epoch": 0.003923363630419146,
+      "grad_norm": 2.90625,
+      "learning_rate": 6e-05,
+      "loss": 10.1894,
+      "step": 30
+    },
+    {
+      "epoch": 0.004577257568822337,
+      "grad_norm": 2.984375,
+      "learning_rate": 7.000000000000001e-05,
+      "loss": 10.0521,
+      "step": 35
+    },
+    {
+      "epoch": 0.005231151507225528,
+      "grad_norm": 2.5625,
+      "learning_rate": 8e-05,
+      "loss": 9.9261,
+      "step": 40
+    },
+    {
+      "epoch": 0.005885045445628719,
+      "grad_norm": 2.59375,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 9.7805,
+      "step": 45
+    },
+    {
+      "epoch": 0.00653893938403191,
+      "grad_norm": 2.515625,
+      "learning_rate": 0.0001,
+      "loss": 9.5316,
+      "step": 50
+    },
+    {
+      "epoch": 0.007192833322435101,
+      "grad_norm": 2.109375,
+      "learning_rate": 0.00011,
+      "loss": 9.3912,
+      "step": 55
+    },
+    {
+      "epoch": 0.007846727260838291,
+      "grad_norm": 2.015625,
+      "learning_rate": 0.00012,
+      "loss": 9.2058,
+      "step": 60
+    },
+    {
+      "epoch": 0.008500621199241483,
+      "grad_norm": 1.828125,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 9.0564,
+      "step": 65
+    },
+    {
+      "epoch": 0.009154515137644674,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00014000000000000001,
+      "loss": 8.8344,
+      "step": 70
+    },
+    {
+      "epoch": 0.009808409076047865,
+      "grad_norm": 1.375,
+      "learning_rate": 0.00015,
+      "loss": 8.6926,
+      "step": 75
+    },
+    {
+      "epoch": 0.010462303014451056,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.00016,
+      "loss": 8.5817,
+      "step": 80
+    },
+    {
+      "epoch": 0.011116196952854247,
+      "grad_norm": 1.171875,
+      "learning_rate": 0.00017,
+      "loss": 8.4945,
+      "step": 85
+    },
+    {
+      "epoch": 0.011770090891257438,
+      "grad_norm": 1.0859375,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 8.4975,
+      "step": 90
+    },
+    {
+      "epoch": 0.01242398482966063,
+      "grad_norm": 1.2734375,
+      "learning_rate": 0.00019,
+      "loss": 8.4394,
+      "step": 95
+    },
+    {
+      "epoch": 0.01307787876806382,
+      "grad_norm": 1.359375,
+      "learning_rate": 0.0002,
+      "loss": 8.4332,
+      "step": 100
+    },
+    {
+      "epoch": 0.013731772706467011,
+      "grad_norm": 1.9375,
+      "learning_rate": 0.00021,
+      "loss": 8.4649,
+      "step": 105
+    },
+    {
+      "epoch": 0.014385666644870202,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00022,
+      "loss": 8.408,
+      "step": 110
+    },
+    {
+      "epoch": 0.015039560583273394,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00023,
+      "loss": 8.3359,
+      "step": 115
+    },
+    {
+      "epoch": 0.015693454521676583,
+      "grad_norm": 1.3671875,
+      "learning_rate": 0.00024,
+      "loss": 8.3483,
+      "step": 120
+    },
+    {
+      "epoch": 0.016347348460079774,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00025,
+      "loss": 8.3056,
+      "step": 125
+    },
+    {
+      "epoch": 0.017001242398482965,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00026000000000000003,
+      "loss": 8.3724,
+      "step": 130
+    },
+    {
+      "epoch": 0.017655136336886156,
+      "grad_norm": 2.171875,
+      "learning_rate": 0.00027,
+      "loss": 8.3476,
+      "step": 135
+    },
+    {
+      "epoch": 0.018309030275289347,
+      "grad_norm": 1.421875,
+      "learning_rate": 0.00028000000000000003,
+      "loss": 8.2929,
+      "step": 140
+    },
+    {
+      "epoch": 0.01896292421369254,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00029,
+      "loss": 8.2493,
+      "step": 145
+    },
+    {
+      "epoch": 0.01961681815209573,
+      "grad_norm": 1.3125,
+      "learning_rate": 0.0003,
+      "loss": 8.291,
+      "step": 150
+    },
+    {
+      "epoch": 0.02027071209049892,
+      "grad_norm": 1.3046875,
+      "learning_rate": 0.00031,
+      "loss": 8.1787,
+      "step": 155
+    },
+    {
+      "epoch": 0.02092460602890211,
+      "grad_norm": 1.4375,
+      "learning_rate": 0.00032,
+      "loss": 8.2745,
+      "step": 160
+    },
+    {
+      "epoch": 0.021578499967305303,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00033,
+      "loss": 8.1676,
+      "step": 165
+    },
+    {
+      "epoch": 0.022232393905708494,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00034,
+      "loss": 8.1576,
+      "step": 170
+    },
+    {
+      "epoch": 0.022886287844111685,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00035,
+      "loss": 8.1338,
+      "step": 175
+    },
+    {
+      "epoch": 0.023540181782514876,
+      "grad_norm": 1.34375,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 8.0981,
+      "step": 180
+    },
+    {
+      "epoch": 0.024194075720918067,
+      "grad_norm": 1.8515625,
+      "learning_rate": 0.00037,
+      "loss": 8.1553,
+      "step": 185
+    },
+    {
+      "epoch": 0.02484796965932126,
+      "grad_norm": 2.0625,
+      "learning_rate": 0.00038,
+      "loss": 8.0505,
+      "step": 190
+    },
+    {
+      "epoch": 0.02550186359772445,
+      "grad_norm": 1.4140625,
+      "learning_rate": 0.00039000000000000005,
+      "loss": 8.0276,
+      "step": 195
+    },
+    {
+      "epoch": 0.02615575753612764,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.0004,
+      "loss": 8.026,
+      "step": 200
+    },
+    {
+      "epoch": 0.02680965147453083,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.00041,
+      "loss": 8.0615,
+      "step": 205
+    },
+    {
+      "epoch": 0.027463545412934023,
+      "grad_norm": 1.6171875,
+      "learning_rate": 0.00042,
+      "loss": 8.0192,
+      "step": 210
+    },
+    {
+      "epoch": 0.028117439351337214,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.00043,
+      "loss": 7.9596,
+      "step": 215
+    },
+    {
+      "epoch": 0.028771333289740405,
+      "grad_norm": 1.484375,
+      "learning_rate": 0.00044,
+      "loss": 7.9729,
+      "step": 220
+    },
+    {
+      "epoch": 0.029425227228143596,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.00045000000000000004,
+      "loss": 7.9676,
+      "step": 225
+    },
+    {
+      "epoch": 0.030079121166546787,
+      "grad_norm": 1.703125,
+      "learning_rate": 0.00046,
+      "loss": 7.9347,
+      "step": 230
+    },
+    {
+      "epoch": 0.030733015104949978,
+      "grad_norm": 1.875,
+      "learning_rate": 0.00047,
+      "loss": 7.9303,
+      "step": 235
+    },
+    {
+      "epoch": 0.031386909043353166,
+      "grad_norm": 1.671875,
+      "learning_rate": 0.00048,
+      "loss": 7.9393,
+      "step": 240
+    },
+    {
+      "epoch": 0.03204080298175636,
+      "grad_norm": 1.59375,
+      "learning_rate": 0.00049,
+      "loss": 7.8751,
+      "step": 245
+    },
+    {
+      "epoch": 0.03269469692015955,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0005,
+      "loss": 7.9601,
+      "step": 250
+    },
+    {
+      "epoch": 0.03334859085856274,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.00051,
+      "loss": 7.8759,
+      "step": 255
+    },
+    {
+      "epoch": 0.03400248479696593,
+      "grad_norm": 1.6796875,
+      "learning_rate": 0.0005200000000000001,
+      "loss": 7.8174,
+      "step": 260
+    },
+    {
+      "epoch": 0.03465637873536912,
+      "grad_norm": 1.4609375,
+      "learning_rate": 0.0005300000000000001,
+      "loss": 7.8309,
+      "step": 265
+    },
+    {
+      "epoch": 0.03531027267377231,
+      "grad_norm": 1.90625,
+      "learning_rate": 0.00054,
+      "loss": 7.8638,
+      "step": 270
+    },
+    {
+      "epoch": 0.035964166612175504,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00055,
+      "loss": 7.8285,
+      "step": 275
+    },
+    {
+      "epoch": 0.036618060550578695,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0005600000000000001,
+      "loss": 7.8669,
+      "step": 280
+    },
+    {
+      "epoch": 0.037271954488981886,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00057,
+      "loss": 7.8088,
+      "step": 285
+    },
+    {
+      "epoch": 0.03792584842738508,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00058,
+      "loss": 7.7459,
+      "step": 290
+    },
+    {
+      "epoch": 0.03857974236578827,
+      "grad_norm": 1.6015625,
+      "learning_rate": 0.00059,
+      "loss": 7.7809,
+      "step": 295
+    },
+    {
+      "epoch": 0.03923363630419146,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0006,
+      "loss": 7.7242,
+      "step": 300
+    },
+    {
+      "epoch": 0.03988753024259465,
+      "grad_norm": 1.5625,
+      "learning_rate": 0.00061,
+      "loss": 7.7242,
+      "step": 305
+    },
+    {
+      "epoch": 0.04054142418099784,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00062,
+      "loss": 7.7411,
+      "step": 310
+    },
+    {
+      "epoch": 0.04119531811940103,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.00063,
+      "loss": 7.6911,
+      "step": 315
+    },
+    {
+      "epoch": 0.04184921205780422,
+      "grad_norm": 1.4296875,
+      "learning_rate": 0.00064,
+      "loss": 7.6351,
+      "step": 320
+    },
+    {
+      "epoch": 0.042503105996207415,
+      "grad_norm": 1.5078125,
+      "learning_rate": 0.0006500000000000001,
+      "loss": 7.6582,
+      "step": 325
+    },
+    {
+      "epoch": 0.043156999934610606,
+      "grad_norm": 1.6640625,
+      "learning_rate": 0.00066,
+      "loss": 7.6329,
+      "step": 330
+    },
+    {
+      "epoch": 0.0438108938730138,
+      "grad_norm": 1.6875,
+      "learning_rate": 0.00067,
+      "loss": 7.6285,
+      "step": 335
+    },
+    {
+      "epoch": 0.04446478781141699,
+      "grad_norm": 1.5,
+      "learning_rate": 0.00068,
+      "loss": 7.6687,
+      "step": 340
+    },
+    {
+      "epoch": 0.04511868174982018,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00069,
+      "loss": 7.6195,
+      "step": 345
+    },
+    {
+      "epoch": 0.04577257568822337,
+      "grad_norm": 1.5390625,
+      "learning_rate": 0.0007,
+      "loss": 7.5768,
+      "step": 350
+    },
+    {
+      "epoch": 0.04642646962662656,
+      "grad_norm": 1.9609375,
+      "learning_rate": 0.00071,
+      "loss": 7.6513,
+      "step": 355
+    },
+    {
+      "epoch": 0.04708036356502975,
+      "grad_norm": 1.6953125,
+      "learning_rate": 0.0007199999999999999,
+      "loss": 7.587,
+      "step": 360
+    },
+    {
+      "epoch": 0.04773425750343294,
+      "grad_norm": 1.9765625,
+      "learning_rate": 0.00073,
+      "loss": 7.5975,
+      "step": 365
+    },
+    {
+      "epoch": 0.048388151441836134,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00074,
+      "loss": 7.565,
+      "step": 370
+    },
+    {
+      "epoch": 0.049042045380239326,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00075,
+      "loss": 7.5604,
+      "step": 375
+    },
+    {
+      "epoch": 0.04969593931864252,
+      "grad_norm": 1.515625,
+      "learning_rate": 0.00076,
+      "loss": 7.5916,
+      "step": 380
+    },
+    {
+      "epoch": 0.05034983325704571,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0007700000000000001,
+      "loss": 7.5601,
+      "step": 385
+    },
+    {
+      "epoch": 0.0510037271954489,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.0007800000000000001,
+      "loss": 7.5528,
+      "step": 390
+    },
+    {
+      "epoch": 0.05165762113385209,
+      "grad_norm": 1.7109375,
+      "learning_rate": 0.00079,
+      "loss": 7.5534,
+      "step": 395
+    },
+    {
+      "epoch": 0.05231151507225528,
+      "grad_norm": 1.4765625,
+      "learning_rate": 0.0008,
+      "loss": 7.5243,
+      "step": 400
+    },
+    {
+      "epoch": 0.05296540901065847,
+      "grad_norm": 1.5703125,
+      "learning_rate": 0.0008100000000000001,
+      "loss": 7.5231,
+      "step": 405
+    },
+    {
+      "epoch": 0.05361930294906166,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00082,
+      "loss": 7.5174,
+      "step": 410
+    },
+    {
+      "epoch": 0.054273196887464854,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00083,
+      "loss": 7.4922,
+      "step": 415
+    },
+    {
+      "epoch": 0.054927090825868045,
+      "grad_norm": 1.6484375,
+      "learning_rate": 0.00084,
+      "loss": 7.3765,
+      "step": 420
+    },
+    {
+      "epoch": 0.055580984764271237,
+      "grad_norm": 1.609375,
+      "learning_rate": 0.00085,
+      "loss": 7.4035,
+      "step": 425
+    },
+    {
+      "epoch": 0.05623487870267443,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00086,
+      "loss": 7.4614,
+      "step": 430
+    },
+    {
+      "epoch": 0.05688877264107762,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.00087,
+      "loss": 7.4449,
+      "step": 435
+    },
+    {
+      "epoch": 0.05754266657948081,
+      "grad_norm": 1.5859375,
+      "learning_rate": 0.00088,
+      "loss": 7.5239,
+      "step": 440
+    },
+    {
+      "epoch": 0.058196560517884,
+      "grad_norm": 1.5,
+      "learning_rate": 0.0008900000000000001,
+      "loss": 7.3661,
+      "step": 445
+    },
+    {
+      "epoch": 0.05885045445628719,
+      "grad_norm": 1.53125,
+      "learning_rate": 0.0009000000000000001,
+      "loss": 7.3652,
+      "step": 450
+    },
+    {
+      "epoch": 0.05950434839469038,
+      "grad_norm": 1.296875,
+      "learning_rate": 0.00091,
+      "loss": 7.4256,
+      "step": 455
+    },
+    {
+      "epoch": 0.060158242333093574,
+      "grad_norm": 1.5546875,
+      "learning_rate": 0.00092,
+      "loss": 7.4048,
+      "step": 460
+    },
+    {
+      "epoch": 0.060812136271496765,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00093,
+      "loss": 7.4004,
+      "step": 465
+    },
+    {
+      "epoch": 0.061466030209899956,
+      "grad_norm": 1.390625,
+      "learning_rate": 0.00094,
+      "loss": 7.3547,
+      "step": 470
+    },
+    {
+      "epoch": 0.06211992414830315,
+      "grad_norm": 1.5234375,
+      "learning_rate": 0.00095,
+      "loss": 7.4078,
+      "step": 475
+    },
+    {
+      "epoch": 0.06277381808670633,
+      "grad_norm": 1.46875,
+      "learning_rate": 0.00096,
+      "loss": 7.3878,
+      "step": 480
+    },
+    {
+      "epoch": 0.06342771202510952,
+      "grad_norm": 1.453125,
+      "learning_rate": 0.0009699999999999999,
+      "loss": 7.3434,
+      "step": 485
+    },
+    {
+      "epoch": 0.06408160596351271,
+      "grad_norm": 1.625,
+      "learning_rate": 0.00098,
+      "loss": 7.3696,
+      "step": 490
+    },
+    {
+      "epoch": 0.0647354999019159,
+      "grad_norm": 1.4453125,
+      "learning_rate": 0.00099,
+      "loss": 7.3436,
+      "step": 495
+    },
+    {
+      "epoch": 0.0653893938403191,
+      "grad_norm": 1.4921875,
+      "learning_rate": 0.001,
+      "loss": 7.2601,
+      "step": 500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 670418530467840.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/checkpoint-500/training_args.bin
+++ b/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161
--- a/config.json
+++ b/config.json
@@ -0,0 +1,35 @@
+{
+  "_name_or_path": "goldfish-models/tur_latn_10mb",
+  "activation_function": "gelu",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50000,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50001,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 2048,
+  "n_embd": 512,
+  "n_head": 8,
+  "n_inner": 2048,
+  "n_layer": 4,
+  "n_positions": 2048,
+  "pad_token_id": 50002,
+  "prefix": "[CLS]",
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.47.0",
+  "use_cache": true,
+  "vocab_size": 51200
+}
--- a/generation_config.json
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 50000,
+  "eos_token_id": 50001,
+  "pad_token_id": 50002,
+  "transformers_version": "4.47.0"
+}
--- a/model.safetensors
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61b5dd841979da98c7ad0f596821d61d9746c9c2ecc8dcfb868422d9bcb393f1
+size 79752272
--- a/special_tokens_map.json
+++ b/special_tokens_map.json
--- a/tokenizer.json
+++ b/tokenizer.json
--- a/tokenizer_config.json
+++ b/tokenizer_config.json
--- a/training_args.bin
+++ b/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6dc7942391c97f5c71d525af067d82cf56a6d2234bb905634d507c9b596fea7c
+size 6161