From e3b4cb1fe1a10b56361dc40b20905fba463c8c3b Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Fri, 24 Apr 2026 21:13:03 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: codingmonster1234/chess-sft-modelv2 Source: Original Platform --- .gitattributes | 36 + README.md | 58 ++ chat_template.jinja | 61 ++ config.json | 71 ++ generation_config.json | 12 + model.safetensors | 3 + optimizer.pt | 3 + rng_state.pth | 3 + scheduler.pt | 3 + tokenizer.json | 3 + tokenizer_config.json | 239 ++++++ trainer_state.json | 1738 ++++++++++++++++++++++++++++++++++++++++ training_args.bin | 3 + 13 files changed, 2233 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model.safetensors create mode 100644 optimizer.pt create mode 100644 rng_state.pth create mode 100644 scheduler.pt create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack 
filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..35d801f --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +--- +base_model: Qwen/Qwen3-4B-Instruct-2507 +library_name: transformers +model_name: output-Qwen3-4B-Instruct-2507 +tags: +- generated_from_trainer +- sft +- trl +licence: license +--- + +# Model Card for output-Qwen3-4B-Instruct-2507 + +This model is a fine-tuned version of [Qwen/Qwen3-4B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" 
+generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/easwar-chess-none/chess-reasoning-v1/runs/aknx54sd) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.29.1 +- Transformers: 5.4.0 +- Pytorch: 2.11.0 +- Datasets: 4.8.4 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..70adff8 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,61 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == 
"user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..94b1c92 --- /dev/null +++ b/config.json @@ -0,0 +1,71 @@ +{ + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 9728, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + 
"full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 262144, + "max_window_layers": 36, + "model_type": "qwen3", + "num_attention_heads": 32, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 5000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "transformers_version": "5.4.0", + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..adfee01 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "5.4.0" +} diff --git a/model.safetensors b/model.safetensors new file mode 100644 index 0000000..ceb43c5 --- /dev/null +++ b/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db62c16fc7e72b520780f950405f0befffc05eb21dea3b0429c54aaa0fac5f63 +size 8044982080 diff --git a/optimizer.pt b/optimizer.pt new file mode 100644 index 0000000..62a14e2 --- /dev/null +++ b/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:188874149b517f101f308cac71f71874db6f1c5e40d45d31f15d3a3ad0c7a240 +size 16090225449 diff --git a/rng_state.pth b/rng_state.pth new file mode 100644 index 0000000..e92cf8f --- /dev/null +++ b/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e +size 14645 diff --git a/scheduler.pt b/scheduler.pt new file mode 100644 index 0000000..7759360 --- /dev/null +++ b/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc24cc580d6b93c8a95bc57b42299118656280a2fcbfb6854e41b57414837d71 +size 1465 diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..c7afbed --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506 +size 11422650 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..51c1be0 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,239 @@ +{ + "add_prefix_space": false, + "added_tokens_decoder": { + "151643": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151645": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151646": { + "content": "<|object_ref_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|object_ref_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151648": { + "content": "<|box_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151649": { + "content": "<|box_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151665": { + "content": "<tool_response>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151666": { + "content": "</tool_response>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151667": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151668": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|object_ref_start|>", + "<|object_ref_end|>", + "<|box_start|>", + "<|box_end|>", + "<|quad_start|>", + "<|quad_end|>", + "<|vision_start|>", + "<|vision_end|>", + "<|vision_pad|>", + "<|image_pad|>", + "<|video_pad|>" + ], + "bos_token": null, + "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if 
(message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|im_end|>", + "errors": "replace", + "model_max_length": 1010000, + "pad_token": "<|endoftext|>", + "split_special_tokens": false, + "tokenizer_class": "Qwen2Tokenizer", + "unk_token": null, + "add_bos_token": false +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..fdf340b --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,1738 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.0061692222952843, + 
"epoch": 0.011904761904761904, + "grad_norm": 3.390625, + "learning_rate": 2e-05, + "loss": 2.293125867843628, + "mean_token_accuracy": 0.5738132819533348, + "num_tokens": 29832.0, + "step": 1 + }, + { + "entropy": 1.385195016860962, + "epoch": 0.023809523809523808, + "grad_norm": 0.78125, + "learning_rate": 1.999922292480975e-05, + "loss": 1.5697591304779053, + "mean_token_accuracy": 0.6427712365984917, + "num_tokens": 58835.0, + "step": 2 + }, + { + "entropy": 1.5784537345170975, + "epoch": 0.03571428571428571, + "grad_norm": 0.62890625, + "learning_rate": 1.9996891820008165e-05, + "loss": 1.5061622858047485, + "mean_token_accuracy": 0.654805600643158, + "num_tokens": 88089.0, + "step": 3 + }, + { + "entropy": 1.5019408017396927, + "epoch": 0.047619047619047616, + "grad_norm": 0.43359375, + "learning_rate": 1.9993007047883988e-05, + "loss": 1.3531173467636108, + "mean_token_accuracy": 0.6810621172189713, + "num_tokens": 116996.0, + "step": 4 + }, + { + "entropy": 1.442432388663292, + "epoch": 0.05952380952380952, + "grad_norm": 0.369140625, + "learning_rate": 1.9987569212189224e-05, + "loss": 1.2870382070541382, + "mean_token_accuracy": 0.6946646422147751, + "num_tokens": 146502.0, + "step": 5 + }, + { + "entropy": 1.383298322558403, + "epoch": 0.07142857142857142, + "grad_norm": 0.318359375, + "learning_rate": 1.9980579158045322e-05, + "loss": 1.2606914043426514, + "mean_token_accuracy": 0.6914810612797737, + "num_tokens": 175000.0, + "step": 6 + }, + { + "entropy": 1.3554321229457855, + "epoch": 0.08333333333333333, + "grad_norm": 0.359375, + "learning_rate": 1.9972037971811802e-05, + "loss": 1.2325180768966675, + "mean_token_accuracy": 0.6992553323507309, + "num_tokens": 203581.0, + "step": 7 + }, + { + "entropy": 1.301919937133789, + "epoch": 0.09523809523809523, + "grad_norm": 0.3046875, + "learning_rate": 1.9961946980917457e-05, + "loss": 1.1691060066223145, + "mean_token_accuracy": 0.714451938867569, + "num_tokens": 233225.0, + "step": 8 + }, + { + 
"entropy": 1.3274528235197067, + "epoch": 0.10714285714285714, + "grad_norm": 0.296875, + "learning_rate": 1.9950307753654016e-05, + "loss": 1.22238290309906, + "mean_token_accuracy": 0.6991388499736786, + "num_tokens": 261557.0, + "step": 9 + }, + { + "entropy": 1.3020492941141129, + "epoch": 0.11904761904761904, + "grad_norm": 0.279296875, + "learning_rate": 1.9937122098932428e-05, + "loss": 1.1407413482666016, + "mean_token_accuracy": 0.7115657702088356, + "num_tokens": 290843.0, + "step": 10 + }, + { + "entropy": 1.2911820262670517, + "epoch": 0.13095238095238096, + "grad_norm": 0.263671875, + "learning_rate": 1.9922392066001724e-05, + "loss": 1.1007871627807617, + "mean_token_accuracy": 0.7251745313405991, + "num_tokens": 320963.0, + "step": 11 + }, + { + "entropy": 1.305821493268013, + "epoch": 0.14285714285714285, + "grad_norm": 0.2890625, + "learning_rate": 1.9906119944130527e-05, + "loss": 1.0885382890701294, + "mean_token_accuracy": 0.7273061871528625, + "num_tokens": 350648.0, + "step": 12 + }, + { + "entropy": 1.3162220120429993, + "epoch": 0.15476190476190477, + "grad_norm": 0.265625, + "learning_rate": 1.9888308262251286e-05, + "loss": 1.0963213443756104, + "mean_token_accuracy": 0.7211973443627357, + "num_tokens": 380096.0, + "step": 13 + }, + { + "entropy": 1.3141592741012573, + "epoch": 0.16666666666666666, + "grad_norm": 0.2734375, + "learning_rate": 1.9868959788567213e-05, + "loss": 1.0897754430770874, + "mean_token_accuracy": 0.7258400693535805, + "num_tokens": 407435.0, + "step": 14 + }, + { + "entropy": 1.3073242455720901, + "epoch": 0.17857142857142858, + "grad_norm": 0.2578125, + "learning_rate": 1.9848077530122083e-05, + "loss": 1.0494160652160645, + "mean_token_accuracy": 0.7293207123875618, + "num_tokens": 435734.0, + "step": 15 + }, + { + "entropy": 1.3367096036672592, + "epoch": 0.19047619047619047, + "grad_norm": 0.255859375, + "learning_rate": 1.9825664732332886e-05, + "loss": 1.1211317777633667, + "mean_token_accuracy": 
0.7143202275037766, + "num_tokens": 464973.0, + "step": 16 + }, + { + "entropy": 1.3097643703222275, + "epoch": 0.20238095238095238, + "grad_norm": 0.25, + "learning_rate": 1.9801724878485438e-05, + "loss": 1.0753662586212158, + "mean_token_accuracy": 0.7259641215205193, + "num_tokens": 493135.0, + "step": 17 + }, + { + "entropy": 1.2622702419757843, + "epoch": 0.21428571428571427, + "grad_norm": 0.232421875, + "learning_rate": 1.977626168919305e-05, + "loss": 1.007223129272461, + "mean_token_accuracy": 0.744126707315445, + "num_tokens": 522656.0, + "step": 18 + }, + { + "entropy": 1.2859665155410767, + "epoch": 0.2261904761904762, + "grad_norm": 0.23046875, + "learning_rate": 1.9749279121818235e-05, + "loss": 1.0457340478897095, + "mean_token_accuracy": 0.7328037023544312, + "num_tokens": 551875.0, + "step": 19 + }, + { + "entropy": 1.275212675333023, + "epoch": 0.23809523809523808, + "grad_norm": 0.251953125, + "learning_rate": 1.9720781369857747e-05, + "loss": 1.0395888090133667, + "mean_token_accuracy": 0.7307759299874306, + "num_tokens": 580523.0, + "step": 20 + }, + { + "entropy": 1.3000101447105408, + "epoch": 0.25, + "grad_norm": 0.2275390625, + "learning_rate": 1.969077286229078e-05, + "loss": 1.0626932382583618, + "mean_token_accuracy": 0.7271415144205093, + "num_tokens": 609771.0, + "step": 21 + }, + { + "entropy": 1.242678239941597, + "epoch": 0.2619047619047619, + "grad_norm": 0.2470703125, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.9827122092247009, + "mean_token_accuracy": 0.7448626458644867, + "num_tokens": 639104.0, + "step": 22 + }, + { + "entropy": 1.2583424746990204, + "epoch": 0.27380952380952384, + "grad_norm": 0.228515625, + "learning_rate": 1.962624246950012e-05, + "loss": 1.0062870979309082, + "mean_token_accuracy": 0.7375933676958084, + "num_tokens": 667792.0, + "step": 23 + }, + { + "entropy": 1.2531014680862427, + "epoch": 0.2857142857142857, + "grad_norm": 0.2294921875, + "learning_rate": 1.9591730613269878e-05, + "loss": 
1.0229589939117432, + "mean_token_accuracy": 0.7366377785801888, + "num_tokens": 696742.0, + "step": 24 + }, + { + "entropy": 1.2342166602611542, + "epoch": 0.2976190476190476, + "grad_norm": 0.2333984375, + "learning_rate": 1.955572805786141e-05, + "loss": 0.9788997769355774, + "mean_token_accuracy": 0.7421486154198647, + "num_tokens": 725968.0, + "step": 25 + }, + { + "entropy": 1.2210585623979568, + "epoch": 0.30952380952380953, + "grad_norm": 0.2578125, + "learning_rate": 1.9518240398613226e-05, + "loss": 0.987277090549469, + "mean_token_accuracy": 0.7420973554253578, + "num_tokens": 755689.0, + "step": 26 + }, + { + "entropy": 1.24309404194355, + "epoch": 0.32142857142857145, + "grad_norm": 0.2421875, + "learning_rate": 1.947927346167132e-05, + "loss": 1.0301053524017334, + "mean_token_accuracy": 0.7300752699375153, + "num_tokens": 784977.0, + "step": 27 + }, + { + "entropy": 1.2028213143348694, + "epoch": 0.3333333333333333, + "grad_norm": 0.2177734375, + "learning_rate": 1.9438833303083677e-05, + "loss": 0.9393562078475952, + "mean_token_accuracy": 0.7491495907306671, + "num_tokens": 814048.0, + "step": 28 + }, + { + "entropy": 1.2287103980779648, + "epoch": 0.34523809523809523, + "grad_norm": 0.228515625, + "learning_rate": 1.9396926207859085e-05, + "loss": 1.0168366432189941, + "mean_token_accuracy": 0.7329602986574173, + "num_tokens": 843602.0, + "step": 29 + }, + { + "entropy": 1.2081626951694489, + "epoch": 0.35714285714285715, + "grad_norm": 0.2275390625, + "learning_rate": 1.935355868899034e-05, + "loss": 0.958310604095459, + "mean_token_accuracy": 0.7456908002495766, + "num_tokens": 871915.0, + "step": 30 + }, + { + "entropy": 1.2221457809209824, + "epoch": 0.36904761904761907, + "grad_norm": 0.2294921875, + "learning_rate": 1.9308737486442045e-05, + "loss": 0.9946644902229309, + "mean_token_accuracy": 0.7383344992995262, + "num_tokens": 900851.0, + "step": 31 + }, + { + "entropy": 1.1801428943872452, + "epoch": 0.38095238095238093, + "grad_norm": 
0.21484375, + "learning_rate": 1.926246956610309e-05, + "loss": 0.9103766083717346, + "mean_token_accuracy": 0.7624464929103851, + "num_tokens": 929498.0, + "step": 32 + }, + { + "entropy": 1.2152698189020157, + "epoch": 0.39285714285714285, + "grad_norm": 0.2333984375, + "learning_rate": 1.921476211870408e-05, + "loss": 0.9737407565116882, + "mean_token_accuracy": 0.7427262291312218, + "num_tokens": 958933.0, + "step": 33 + }, + { + "entropy": 1.2030568569898605, + "epoch": 0.40476190476190477, + "grad_norm": 0.22265625, + "learning_rate": 1.9165622558699763e-05, + "loss": 0.9593278169631958, + "mean_token_accuracy": 0.7506603300571442, + "num_tokens": 987731.0, + "step": 34 + }, + { + "entropy": 1.1957021951675415, + "epoch": 0.4166666666666667, + "grad_norm": 0.2158203125, + "learning_rate": 1.9115058523116734e-05, + "loss": 0.9239043593406677, + "mean_token_accuracy": 0.7555749863386154, + "num_tokens": 1017002.0, + "step": 35 + }, + { + "entropy": 1.2133885324001312, + "epoch": 0.42857142857142855, + "grad_norm": 0.216796875, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.9809866547584534, + "mean_token_accuracy": 0.7437998279929161, + "num_tokens": 1046678.0, + "step": 36 + }, + { + "entropy": 1.2098581492900848, + "epoch": 0.44047619047619047, + "grad_norm": 0.2236328125, + "learning_rate": 1.900968867902419e-05, + "loss": 0.938984215259552, + "mean_token_accuracy": 0.7494841367006302, + "num_tokens": 1074445.0, + "step": 37 + }, + { + "entropy": 1.1815967112779617, + "epoch": 0.4523809523809524, + "grad_norm": 0.236328125, + "learning_rate": 1.895489924657301e-05, + "loss": 0.8934326767921448, + "mean_token_accuracy": 0.7595476359128952, + "num_tokens": 1103620.0, + "step": 38 + }, + { + "entropy": 1.2028009444475174, + "epoch": 0.4642857142857143, + "grad_norm": 0.2265625, + "learning_rate": 1.8898718088114688e-05, + "loss": 0.922984778881073, + "mean_token_accuracy": 0.7540801167488098, + "num_tokens": 1132637.0, + "step": 39 + }, + { + "entropy": 
1.2034422308206558, + "epoch": 0.47619047619047616, + "grad_norm": 1.171875, + "learning_rate": 1.8841153935046098e-05, + "loss": 0.9033240675926208, + "mean_token_accuracy": 0.7560576424002647, + "num_tokens": 1161527.0, + "step": 40 + }, + { + "entropy": 1.1716476827859879, + "epoch": 0.4880952380952381, + "grad_norm": 0.2138671875, + "learning_rate": 1.8782215733702286e-05, + "loss": 0.8880018591880798, + "mean_token_accuracy": 0.7613470479846001, + "num_tokens": 1190701.0, + "step": 41 + }, + { + "entropy": 1.2157341986894608, + "epoch": 0.5, + "grad_norm": 0.2314453125, + "learning_rate": 1.8721912643966055e-05, + "loss": 0.9609653949737549, + "mean_token_accuracy": 0.7453824803233147, + "num_tokens": 1218835.0, + "step": 42 + }, + { + "entropy": 1.197568565607071, + "epoch": 0.5119047619047619, + "grad_norm": 0.216796875, + "learning_rate": 1.866025403784439e-05, + "loss": 0.9219189882278442, + "mean_token_accuracy": 0.7547592371702194, + "num_tokens": 1248679.0, + "step": 43 + }, + { + "entropy": 1.1708803623914719, + "epoch": 0.5238095238095238, + "grad_norm": 0.373046875, + "learning_rate": 1.8597249498011906e-05, + "loss": 0.8802202343940735, + "mean_token_accuracy": 0.7667126134037971, + "num_tokens": 1277106.0, + "step": 44 + }, + { + "entropy": 1.191767856478691, + "epoch": 0.5357142857142857, + "grad_norm": 0.2197265625, + "learning_rate": 1.8532908816321557e-05, + "loss": 0.9313769936561584, + "mean_token_accuracy": 0.7529165670275688, + "num_tokens": 1305983.0, + "step": 45 + }, + { + "entropy": 1.2066084146499634, + "epoch": 0.5476190476190477, + "grad_norm": 0.2255859375, + "learning_rate": 1.8467241992282842e-05, + "loss": 0.9347527027130127, + "mean_token_accuracy": 0.7446473762392998, + "num_tokens": 1334578.0, + "step": 46 + }, + { + "entropy": 1.177584484219551, + "epoch": 0.5595238095238095, + "grad_norm": 0.25, + "learning_rate": 1.8400259231507716e-05, + "loss": 0.8884726166725159, + "mean_token_accuracy": 0.7611024901270866, + 
"num_tokens": 1362873.0, + "step": 47 + }, + { + "entropy": 1.1629594564437866, + "epoch": 0.5714285714285714, + "grad_norm": 0.2265625, + "learning_rate": 1.833197094412449e-05, + "loss": 0.8861435651779175, + "mean_token_accuracy": 0.76307063549757, + "num_tokens": 1391315.0, + "step": 48 + }, + { + "entropy": 1.168922871351242, + "epoch": 0.5833333333333334, + "grad_norm": 0.23046875, + "learning_rate": 1.826238774315995e-05, + "loss": 0.8765286207199097, + "mean_token_accuracy": 0.76119015365839, + "num_tokens": 1419829.0, + "step": 49 + }, + { + "entropy": 1.1843004375696182, + "epoch": 0.5952380952380952, + "grad_norm": 0.234375, + "learning_rate": 1.819152044288992e-05, + "loss": 0.9242440462112427, + "mean_token_accuracy": 0.7494527697563171, + "num_tokens": 1447790.0, + "step": 50 + }, + { + "entropy": 1.1673331260681152, + "epoch": 0.6071428571428571, + "grad_norm": 0.2490234375, + "learning_rate": 1.811938005715857e-05, + "loss": 0.8822228312492371, + "mean_token_accuracy": 0.7585421577095985, + "num_tokens": 1476278.0, + "step": 51 + }, + { + "entropy": 1.2116869688034058, + "epoch": 0.6190476190476191, + "grad_norm": 0.2421875, + "learning_rate": 1.8045977797666685e-05, + "loss": 0.9784308671951294, + "mean_token_accuracy": 0.7404012456536293, + "num_tokens": 1503947.0, + "step": 52 + }, + { + "entropy": 1.162365809082985, + "epoch": 0.6309523809523809, + "grad_norm": 0.2265625, + "learning_rate": 1.7971325072229227e-05, + "loss": 0.9283543825149536, + "mean_token_accuracy": 0.7499738857150078, + "num_tokens": 1533531.0, + "step": 53 + }, + { + "entropy": 1.1863622218370438, + "epoch": 0.6428571428571429, + "grad_norm": 0.2421875, + "learning_rate": 1.7895433483002356e-05, + "loss": 0.9471738934516907, + "mean_token_accuracy": 0.7532860413193703, + "num_tokens": 1561412.0, + "step": 54 + }, + { + "entropy": 1.1698070168495178, + "epoch": 0.6547619047619048, + "grad_norm": 0.2255859375, + "learning_rate": 1.78183148246803e-05, + "loss": 
0.9019606709480286, + "mean_token_accuracy": 0.7543124184012413, + "num_tokens": 1590336.0, + "step": 55 + }, + { + "entropy": 1.1683688312768936, + "epoch": 0.6666666666666666, + "grad_norm": 0.208984375, + "learning_rate": 1.7739981082662275e-05, + "loss": 0.9020405411720276, + "mean_token_accuracy": 0.7580606490373611, + "num_tokens": 1620442.0, + "step": 56 + }, + { + "entropy": 1.1867523938417435, + "epoch": 0.6785714285714286, + "grad_norm": 0.216796875, + "learning_rate": 1.766044443118978e-05, + "loss": 0.917300283908844, + "mean_token_accuracy": 0.7553394213318825, + "num_tokens": 1648762.0, + "step": 57 + }, + { + "entropy": 1.1505564451217651, + "epoch": 0.6904761904761905, + "grad_norm": 0.2216796875, + "learning_rate": 1.757971723145453e-05, + "loss": 0.8627029061317444, + "mean_token_accuracy": 0.7657916098833084, + "num_tokens": 1677464.0, + "step": 58 + }, + { + "entropy": 1.1766629666090012, + "epoch": 0.7023809523809523, + "grad_norm": 0.2236328125, + "learning_rate": 1.7497812029677344e-05, + "loss": 0.8795939087867737, + "mean_token_accuracy": 0.7613174989819527, + "num_tokens": 1704994.0, + "step": 59 + }, + { + "entropy": 1.1731744706630707, + "epoch": 0.7142857142857143, + "grad_norm": 0.2158203125, + "learning_rate": 1.741474155515827e-05, + "loss": 0.8988810777664185, + "mean_token_accuracy": 0.7579676881432533, + "num_tokens": 1734202.0, + "step": 60 + }, + { + "entropy": 1.1697156727313995, + "epoch": 0.7261904761904762, + "grad_norm": 0.2255859375, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.9070097804069519, + "mean_token_accuracy": 0.7564781159162521, + "num_tokens": 1763541.0, + "step": 61 + }, + { + "entropy": 1.1686383485794067, + "epoch": 0.7380952380952381, + "grad_norm": 0.2177734375, + "learning_rate": 1.7245156608592727e-05, + "loss": 0.8804867267608643, + "mean_token_accuracy": 0.7639917582273483, + "num_tokens": 1793196.0, + "step": 62 + }, + { + "entropy": 1.195967510342598, + "epoch": 0.75, + "grad_norm": 
0.25390625, + "learning_rate": 1.7158668492597186e-05, + "loss": 0.9389015436172485, + "mean_token_accuracy": 0.747251845896244, + "num_tokens": 1821023.0, + "step": 63 + }, + { + "entropy": 1.1664810329675674, + "epoch": 0.7619047619047619, + "grad_norm": 0.21875, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.9056146740913391, + "mean_token_accuracy": 0.7550350353121758, + "num_tokens": 1849586.0, + "step": 64 + }, + { + "entropy": 1.171183928847313, + "epoch": 0.7738095238095238, + "grad_norm": 0.2177734375, + "learning_rate": 1.698236818086073e-05, + "loss": 0.929341197013855, + "mean_token_accuracy": 0.7491638883948326, + "num_tokens": 1878622.0, + "step": 65 + }, + { + "entropy": 1.1465008854866028, + "epoch": 0.7857142857142857, + "grad_norm": 0.2177734375, + "learning_rate": 1.689258338483947e-05, + "loss": 0.8692110776901245, + "mean_token_accuracy": 0.765314869582653, + "num_tokens": 1907725.0, + "step": 66 + }, + { + "entropy": 1.1706128865480423, + "epoch": 0.7976190476190477, + "grad_norm": 0.216796875, + "learning_rate": 1.6801727377709195e-05, + "loss": 0.886278510093689, + "mean_token_accuracy": 0.7576193287968636, + "num_tokens": 1936209.0, + "step": 67 + }, + { + "entropy": 1.1479064524173737, + "epoch": 0.8095238095238095, + "grad_norm": 0.2216796875, + "learning_rate": 1.67098142798597e-05, + "loss": 0.8587610125541687, + "mean_token_accuracy": 0.7682890966534615, + "num_tokens": 1964915.0, + "step": 68 + }, + { + "entropy": 1.1495172083377838, + "epoch": 0.8214285714285714, + "grad_norm": 0.2294921875, + "learning_rate": 1.6616858375968596e-05, + "loss": 0.8885282874107361, + "mean_token_accuracy": 0.7598370909690857, + "num_tokens": 1993606.0, + "step": 69 + }, + { + "entropy": 1.1534761786460876, + "epoch": 0.8333333333333334, + "grad_norm": 0.2138671875, + "learning_rate": 1.6522874112781213e-05, + "loss": 0.8863916993141174, + "mean_token_accuracy": 0.7640347108244896, + "num_tokens": 2022472.0, + "step": 70 + }, + { + "entropy": 
1.14171202480793, + "epoch": 0.8452380952380952, + "grad_norm": 0.220703125, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.8785849809646606, + "mean_token_accuracy": 0.7604316994547844, + "num_tokens": 2052746.0, + "step": 71 + }, + { + "entropy": 1.1478676050901413, + "epoch": 0.8571428571428571, + "grad_norm": 0.212890625, + "learning_rate": 1.6331879092341402e-05, + "loss": 0.8796285390853882, + "mean_token_accuracy": 0.7586944848299026, + "num_tokens": 2081889.0, + "step": 72 + }, + { + "entropy": 1.1222540885210037, + "epoch": 0.8690476190476191, + "grad_norm": 0.193359375, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.8146858811378479, + "mean_token_accuracy": 0.7756616845726967, + "num_tokens": 2111616.0, + "step": 73 + }, + { + "entropy": 1.153001144528389, + "epoch": 0.8809523809523809, + "grad_norm": 0.224609375, + "learning_rate": 1.6136947947920477e-05, + "loss": 0.8884707689285278, + "mean_token_accuracy": 0.7565625682473183, + "num_tokens": 2140433.0, + "step": 74 + }, + { + "entropy": 1.1275182217359543, + "epoch": 0.8928571428571429, + "grad_norm": 0.2099609375, + "learning_rate": 1.6038044103254775e-05, + "loss": 0.8272450566291809, + "mean_token_accuracy": 0.7704622000455856, + "num_tokens": 2170414.0, + "step": 75 + }, + { + "entropy": 1.1576026529073715, + "epoch": 0.9047619047619048, + "grad_norm": 0.22265625, + "learning_rate": 1.5938201855735017e-05, + "loss": 0.9035623669624329, + "mean_token_accuracy": 0.7542874589562416, + "num_tokens": 2198868.0, + "step": 76 + }, + { + "entropy": 1.1199318170547485, + "epoch": 0.9166666666666666, + "grad_norm": 0.20703125, + "learning_rate": 1.5837436722347902e-05, + "loss": 0.8039325475692749, + "mean_token_accuracy": 0.783287987112999, + "num_tokens": 2228134.0, + "step": 77 + }, + { + "entropy": 1.1484037339687347, + "epoch": 0.9285714285714286, + "grad_norm": 0.2138671875, + "learning_rate": 1.573576436351046e-05, + "loss": 0.8699290752410889, + "mean_token_accuracy": 
0.7641323357820511, + "num_tokens": 2257447.0, + "step": 78 + }, + { + "entropy": 1.1295416802167892, + "epoch": 0.9404761904761905, + "grad_norm": 0.205078125, + "learning_rate": 1.563320058063622e-05, + "loss": 0.8303874731063843, + "mean_token_accuracy": 0.7720286920666695, + "num_tokens": 2286749.0, + "step": 79 + }, + { + "entropy": 1.1563286185264587, + "epoch": 0.9523809523809523, + "grad_norm": 0.21875, + "learning_rate": 1.5529761313679396e-05, + "loss": 0.8524646759033203, + "mean_token_accuracy": 0.7633371129631996, + "num_tokens": 2315039.0, + "step": 80 + }, + { + "entropy": 1.1543449014425278, + "epoch": 0.9642857142857143, + "grad_norm": 0.2158203125, + "learning_rate": 1.5425462638657597e-05, + "loss": 0.9120794534683228, + "mean_token_accuracy": 0.756316527724266, + "num_tokens": 2344737.0, + "step": 81 + }, + { + "entropy": 1.13828843832016, + "epoch": 0.9761904761904762, + "grad_norm": 0.2265625, + "learning_rate": 1.5320320765153367e-05, + "loss": 0.824118971824646, + "mean_token_accuracy": 0.7736462280154228, + "num_tokens": 2373710.0, + "step": 82 + }, + { + "entropy": 1.145560473203659, + "epoch": 0.9880952380952381, + "grad_norm": 0.220703125, + "learning_rate": 1.5214352033794981e-05, + "loss": 0.8729808926582336, + "mean_token_accuracy": 0.7629412487149239, + "num_tokens": 2402610.0, + "step": 83 + }, + { + "entropy": 1.1476428806781769, + "epoch": 1.0, + "grad_norm": 0.22265625, + "learning_rate": 1.5107572913716859e-05, + "loss": 0.8972144722938538, + "mean_token_accuracy": 0.757901057600975, + "num_tokens": 2430019.0, + "step": 84 + }, + { + "epoch": 1.0, + "eval_entropy": 1.1429666471481323, + "eval_loss": 0.8658801317214966, + "eval_mean_token_accuracy": 0.7630383356412251, + "eval_model_preparation_time": 0.0051, + "eval_num_tokens": 2430019.0, + "eval_runtime": 19.169, + "eval_samples_per_second": 7.825, + "eval_steps_per_second": 7.825, + "step": 84 + }, + { + "entropy": 1.1193113178014755, + "epoch": 1.0119047619047619, + 
"grad_norm": 0.205078125, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8069751858711243, + "mean_token_accuracy": 0.7752386555075645, + "num_tokens": 2459653.0, + "step": 85 + }, + { + "entropy": 1.14054836332798, + "epoch": 1.0238095238095237, + "grad_norm": 0.2109375, + "learning_rate": 1.4891650011092896e-05, + "loss": 0.8288445472717285, + "mean_token_accuracy": 0.7729767188429832, + "num_tokens": 2488217.0, + "step": 86 + }, + { + "entropy": 1.1414664089679718, + "epoch": 1.0357142857142858, + "grad_norm": 0.2158203125, + "learning_rate": 1.4782539786213184e-05, + "loss": 0.8254880905151367, + "mean_token_accuracy": 0.7727913111448288, + "num_tokens": 2517578.0, + "step": 87 + }, + { + "entropy": 1.1179616451263428, + "epoch": 1.0476190476190477, + "grad_norm": 0.205078125, + "learning_rate": 1.4672686282730622e-05, + "loss": 0.8098872303962708, + "mean_token_accuracy": 0.7769448384642601, + "num_tokens": 2546116.0, + "step": 88 + }, + { + "entropy": 1.1239117681980133, + "epoch": 1.0595238095238095, + "grad_norm": 0.318359375, + "learning_rate": 1.4562106573531632e-05, + "loss": 0.8263017535209656, + "mean_token_accuracy": 0.7758133932948112, + "num_tokens": 2574681.0, + "step": 89 + }, + { + "entropy": 1.1026111543178558, + "epoch": 1.0714285714285714, + "grad_norm": 0.2080078125, + "learning_rate": 1.4450817844365924e-05, + "loss": 0.8099116086959839, + "mean_token_accuracy": 0.7731629684567451, + "num_tokens": 2603807.0, + "step": 90 + }, + { + "entropy": 1.1024491339921951, + "epoch": 1.0833333333333333, + "grad_norm": 0.2158203125, + "learning_rate": 1.4338837391175582e-05, + "loss": 0.8093633055686951, + "mean_token_accuracy": 0.7739714533090591, + "num_tokens": 2632614.0, + "step": 91 + }, + { + "entropy": 1.1085499972105026, + "epoch": 1.0952380952380953, + "grad_norm": 0.216796875, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.8473532199859619, + "mean_token_accuracy": 0.7683232203125954, + "num_tokens": 2661538.0, + "step": 92 + 
}, + { + "entropy": 1.0892803370952606, + "epoch": 1.1071428571428572, + "grad_norm": 0.220703125, + "learning_rate": 1.4112871031306118e-05, + "loss": 0.8294469118118286, + "mean_token_accuracy": 0.7713945508003235, + "num_tokens": 2690777.0, + "step": 93 + }, + { + "entropy": 1.1031535863876343, + "epoch": 1.119047619047619, + "grad_norm": 0.224609375, + "learning_rate": 1.3998920243197408e-05, + "loss": 0.8391809463500977, + "mean_token_accuracy": 0.7676805257797241, + "num_tokens": 2719730.0, + "step": 94 + }, + { + "entropy": 1.0815589874982834, + "epoch": 1.130952380952381, + "grad_norm": 0.21484375, + "learning_rate": 1.3884347962746949e-05, + "loss": 0.7862935066223145, + "mean_token_accuracy": 0.7806214541196823, + "num_tokens": 2749156.0, + "step": 95 + }, + { + "entropy": 1.084671527147293, + "epoch": 1.1428571428571428, + "grad_norm": 0.2197265625, + "learning_rate": 1.3769171996210053e-05, + "loss": 0.840523898601532, + "mean_token_accuracy": 0.7695459797978401, + "num_tokens": 2778531.0, + "step": 96 + }, + { + "entropy": 1.0894652903079987, + "epoch": 1.1547619047619047, + "grad_norm": 0.216796875, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.7974240779876709, + "mean_token_accuracy": 0.7744667157530785, + "num_tokens": 2806462.0, + "step": 97 + }, + { + "entropy": 1.0971969813108444, + "epoch": 1.1666666666666667, + "grad_norm": 0.22265625, + "learning_rate": 1.3537080696225815e-05, + "loss": 0.8246796131134033, + "mean_token_accuracy": 0.7684177905321121, + "num_tokens": 2835497.0, + "step": 98 + }, + { + "entropy": 1.1123791635036469, + "epoch": 1.1785714285714286, + "grad_norm": 0.2373046875, + "learning_rate": 1.342020143325669e-05, + "loss": 0.8859103322029114, + "mean_token_accuracy": 0.7534352988004684, + "num_tokens": 2865231.0, + "step": 99 + }, + { + "entropy": 1.075607344508171, + "epoch": 1.1904761904761905, + "grad_norm": 0.2265625, + "learning_rate": 1.3302790619551673e-05, + "loss": 0.7980949878692627, + 
"mean_token_accuracy": 0.7762870118021965, + "num_tokens": 2894329.0, + "step": 100 + }, + { + "entropy": 1.1072215735912323, + "epoch": 1.2023809523809523, + "grad_norm": 0.2353515625, + "learning_rate": 1.3184866502516846e-05, + "loss": 0.8650733232498169, + "mean_token_accuracy": 0.764843761920929, + "num_tokens": 2923660.0, + "step": 101 + }, + { + "entropy": 1.0887151509523392, + "epoch": 1.2142857142857142, + "grad_norm": 0.2255859375, + "learning_rate": 1.3066447409333345e-05, + "loss": 0.790311336517334, + "mean_token_accuracy": 0.7792445793747902, + "num_tokens": 2952054.0, + "step": 102 + }, + { + "entropy": 1.1025346666574478, + "epoch": 1.2261904761904763, + "grad_norm": 0.2392578125, + "learning_rate": 1.2947551744109044e-05, + "loss": 0.8180376887321472, + "mean_token_accuracy": 0.7729773372411728, + "num_tokens": 2981426.0, + "step": 103 + }, + { + "entropy": 1.0916212499141693, + "epoch": 1.2380952380952381, + "grad_norm": 0.2373046875, + "learning_rate": 1.2828197985018276e-05, + "loss": 0.7971659898757935, + "mean_token_accuracy": 0.7799450904130936, + "num_tokens": 3009579.0, + "step": 104 + }, + { + "entropy": 1.1104163080453873, + "epoch": 1.25, + "grad_norm": 0.2333984375, + "learning_rate": 1.2708404681430054e-05, + "loss": 0.8455361127853394, + "mean_token_accuracy": 0.7681760489940643, + "num_tokens": 3038292.0, + "step": 105 + }, + { + "entropy": 1.1180581152439117, + "epoch": 1.2619047619047619, + "grad_norm": 0.2431640625, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.8946309685707092, + "mean_token_accuracy": 0.755538322031498, + "num_tokens": 3068231.0, + "step": 106 + }, + { + "entropy": 1.0994994044303894, + "epoch": 1.2738095238095237, + "grad_norm": 0.265625, + "learning_rate": 1.2467573976902936e-05, + "loss": 0.7855837345123291, + "mean_token_accuracy": 0.7798345908522606, + "num_tokens": 3096640.0, + "step": 107 + }, + { + "entropy": 1.0958448350429535, + "epoch": 1.2857142857142856, + "grad_norm": 0.22265625, + 
"learning_rate": 1.2346574004677154e-05, + "loss": 0.8080664277076721, + "mean_token_accuracy": 0.775592751801014, + "num_tokens": 3125619.0, + "step": 108 + }, + { + "entropy": 1.1057351678609848, + "epoch": 1.2976190476190477, + "grad_norm": 0.375, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.8222600817680359, + "mean_token_accuracy": 0.7683183401823044, + "num_tokens": 3155256.0, + "step": 109 + }, + { + "entropy": 1.1132191121578217, + "epoch": 1.3095238095238095, + "grad_norm": 0.2197265625, + "learning_rate": 1.210349884345496e-05, + "loss": 0.8248376250267029, + "mean_token_accuracy": 0.7687205746769905, + "num_tokens": 3183948.0, + "step": 110 + }, + { + "entropy": 1.0987165123224258, + "epoch": 1.3214285714285714, + "grad_norm": 0.31640625, + "learning_rate": 1.1981461431993978e-05, + "loss": 0.8191619515419006, + "mean_token_accuracy": 0.772399052977562, + "num_tokens": 3212463.0, + "step": 111 + }, + { + "entropy": 1.1073571592569351, + "epoch": 1.3333333333333333, + "grad_norm": 0.232421875, + "learning_rate": 1.1859116071629148e-05, + "loss": 0.8318334221839905, + "mean_token_accuracy": 0.7649757117033005, + "num_tokens": 3241487.0, + "step": 112 + }, + { + "entropy": 1.102282091975212, + "epoch": 1.3452380952380953, + "grad_norm": 0.2197265625, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.8375995755195618, + "mean_token_accuracy": 0.7682436108589172, + "num_tokens": 3270436.0, + "step": 113 + }, + { + "entropy": 1.0837299078702927, + "epoch": 1.3571428571428572, + "grad_norm": 0.2158203125, + "learning_rate": 1.1613577606328068e-05, + "loss": 0.7833430767059326, + "mean_token_accuracy": 0.7823601812124252, + "num_tokens": 3299814.0, + "step": 114 + }, + { + "entropy": 1.0879952907562256, + "epoch": 1.369047619047619, + "grad_norm": 0.2119140625, + "learning_rate": 1.1490422661761744e-05, + "loss": 0.7993915677070618, + "mean_token_accuracy": 0.7771986275911331, + "num_tokens": 3328509.0, + "step": 115 + }, + { + "entropy": 
1.112231805920601, + "epoch": 1.380952380952381, + "grad_norm": 0.224609375, + "learning_rate": 1.1367036083100735e-05, + "loss": 0.8307598233222961, + "mean_token_accuracy": 0.7695401236414909, + "num_tokens": 3356953.0, + "step": 116 + }, + { + "entropy": 1.0973141938447952, + "epoch": 1.3928571428571428, + "grad_norm": 0.2314453125, + "learning_rate": 1.1243437046474854e-05, + "loss": 0.8001049757003784, + "mean_token_accuracy": 0.7750882878899574, + "num_tokens": 3385659.0, + "step": 117 + }, + { + "entropy": 1.1106764674186707, + "epoch": 1.4047619047619047, + "grad_norm": 0.228515625, + "learning_rate": 1.1119644761033079e-05, + "loss": 0.820791482925415, + "mean_token_accuracy": 0.7748995646834373, + "num_tokens": 3414046.0, + "step": 118 + }, + { + "entropy": 1.0989094227552414, + "epoch": 1.4166666666666667, + "grad_norm": 0.26171875, + "learning_rate": 1.0995678465958168e-05, + "loss": 0.8132579326629639, + "mean_token_accuracy": 0.7685609012842178, + "num_tokens": 3442153.0, + "step": 119 + }, + { + "entropy": 1.123728185892105, + "epoch": 1.4285714285714286, + "grad_norm": 0.2265625, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.8655298948287964, + "mean_token_accuracy": 0.7622044086456299, + "num_tokens": 3471630.0, + "step": 120 + }, + { + "entropy": 1.1011102497577667, + "epoch": 1.4404761904761905, + "grad_norm": 0.21875, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.8160438537597656, + "mean_token_accuracy": 0.7715617045760155, + "num_tokens": 3500341.0, + "step": 121 + }, + { + "entropy": 1.0802496522665024, + "epoch": 1.4523809523809523, + "grad_norm": 0.22265625, + "learning_rate": 1.0622928302448523e-05, + "loss": 0.795846700668335, + "mean_token_accuracy": 0.7745838463306427, + "num_tokens": 3530737.0, + "step": 122 + }, + { + "entropy": 1.111521065235138, + "epoch": 1.4642857142857144, + "grad_norm": 0.2275390625, + "learning_rate": 1.0498458856606972e-05, + "loss": 0.8259180188179016, + "mean_token_accuracy": 
0.7704022750258446, + "num_tokens": 3559280.0, + "step": 123 + }, + { + "entropy": 1.1052347421646118, + "epoch": 1.4761904761904763, + "grad_norm": 0.240234375, + "learning_rate": 1.037391194276326e-05, + "loss": 0.8490574359893799, + "mean_token_accuracy": 0.7630502879619598, + "num_tokens": 3588223.0, + "step": 124 + }, + { + "entropy": 1.1114005744457245, + "epoch": 1.4880952380952381, + "grad_norm": 0.2255859375, + "learning_rate": 1.0249306917380731e-05, + "loss": 0.8460506796836853, + "mean_token_accuracy": 0.766345664858818, + "num_tokens": 3617347.0, + "step": 125 + }, + { + "entropy": 1.115243524312973, + "epoch": 1.5, + "grad_norm": 0.21484375, + "learning_rate": 1.0124663145954152e-05, + "loss": 0.8421509265899658, + "mean_token_accuracy": 0.7646084725856781, + "num_tokens": 3646452.0, + "step": 126 + }, + { + "entropy": 1.1102195531129837, + "epoch": 1.5119047619047619, + "grad_norm": 0.68359375, + "learning_rate": 1e-05, + "loss": 0.8465963006019592, + "mean_token_accuracy": 0.7651297971606255, + "num_tokens": 3674684.0, + "step": 127 + }, + { + "entropy": 1.1138557642698288, + "epoch": 1.5238095238095237, + "grad_norm": 0.2392578125, + "learning_rate": 9.87533685404585e-06, + "loss": 0.8462578058242798, + "mean_token_accuracy": 0.7656397670507431, + "num_tokens": 3701972.0, + "step": 128 + }, + { + "entropy": 1.0684314519166946, + "epoch": 1.5357142857142856, + "grad_norm": 0.2294921875, + "learning_rate": 9.750693082619274e-06, + "loss": 0.7849152684211731, + "mean_token_accuracy": 0.7857328802347183, + "num_tokens": 3731223.0, + "step": 129 + }, + { + "entropy": 1.0986532717943192, + "epoch": 1.5476190476190477, + "grad_norm": 0.234375, + "learning_rate": 9.626088057236745e-06, + "loss": 0.8162216544151306, + "mean_token_accuracy": 0.7728657871484756, + "num_tokens": 3759466.0, + "step": 130 + }, + { + "entropy": 1.1011187136173248, + "epoch": 1.5595238095238095, + "grad_norm": 0.220703125, + "learning_rate": 9.501541143393028e-06, + "loss": 
0.8209044933319092, + "mean_token_accuracy": 0.7711444199085236, + "num_tokens": 3788276.0, + "step": 131 + }, + { + "entropy": 1.0799630433321, + "epoch": 1.5714285714285714, + "grad_norm": 0.21875, + "learning_rate": 9.377071697551479e-06, + "loss": 0.7802744507789612, + "mean_token_accuracy": 0.7825465202331543, + "num_tokens": 3817834.0, + "step": 132 + }, + { + "entropy": 1.1000354290008545, + "epoch": 1.5833333333333335, + "grad_norm": 0.21875, + "learning_rate": 9.252699064135759e-06, + "loss": 0.8035217523574829, + "mean_token_accuracy": 0.7786530405282974, + "num_tokens": 3846803.0, + "step": 133 + }, + { + "entropy": 1.106198564171791, + "epoch": 1.5952380952380953, + "grad_norm": 0.2119140625, + "learning_rate": 9.128442572523418e-06, + "loss": 0.8161381483078003, + "mean_token_accuracy": 0.7741018161177635, + "num_tokens": 3875363.0, + "step": 134 + }, + { + "entropy": 1.094715103507042, + "epoch": 1.6071428571428572, + "grad_norm": 0.2236328125, + "learning_rate": 9.004321534041836e-06, + "loss": 0.797153115272522, + "mean_token_accuracy": 0.7743495553731918, + "num_tokens": 3904020.0, + "step": 135 + }, + { + "entropy": 1.0934260189533234, + "epoch": 1.619047619047619, + "grad_norm": 0.2236328125, + "learning_rate": 8.880355238966923e-06, + "loss": 0.7957767248153687, + "mean_token_accuracy": 0.7722097188234329, + "num_tokens": 3932554.0, + "step": 136 + }, + { + "entropy": 1.0932885110378265, + "epoch": 1.630952380952381, + "grad_norm": 0.2216796875, + "learning_rate": 8.756562953525151e-06, + "loss": 0.8285123109817505, + "mean_token_accuracy": 0.7748213410377502, + "num_tokens": 3963124.0, + "step": 137 + }, + { + "entropy": 1.0973111540079117, + "epoch": 1.6428571428571428, + "grad_norm": 0.220703125, + "learning_rate": 8.632963916899268e-06, + "loss": 0.8037251234054565, + "mean_token_accuracy": 0.7732022255659103, + "num_tokens": 3991485.0, + "step": 138 + }, + { + "entropy": 1.1007558554410934, + "epoch": 1.6547619047619047, + "grad_norm": 
0.22265625, + "learning_rate": 8.509577338238255e-06, + "loss": 0.8211590051651001, + "mean_token_accuracy": 0.770406000316143, + "num_tokens": 4020583.0, + "step": 139 + }, + { + "entropy": 1.0826598927378654, + "epoch": 1.6666666666666665, + "grad_norm": 0.2177734375, + "learning_rate": 8.386422393671934e-06, + "loss": 0.7706205248832703, + "mean_token_accuracy": 0.7830442562699318, + "num_tokens": 4049853.0, + "step": 140 + }, + { + "entropy": 1.1024836301803589, + "epoch": 1.6785714285714286, + "grad_norm": 0.2099609375, + "learning_rate": 8.263518223330698e-06, + "loss": 0.8168199062347412, + "mean_token_accuracy": 0.7706331759691238, + "num_tokens": 4079030.0, + "step": 141 + }, + { + "entropy": 1.1091957688331604, + "epoch": 1.6904761904761905, + "grad_norm": 0.2236328125, + "learning_rate": 8.140883928370855e-06, + "loss": 0.8526521325111389, + "mean_token_accuracy": 0.7632784247398376, + "num_tokens": 4108702.0, + "step": 142 + }, + { + "entropy": 1.0926142483949661, + "epoch": 1.7023809523809523, + "grad_norm": 0.251953125, + "learning_rate": 8.018538568006027e-06, + "loss": 0.800937294960022, + "mean_token_accuracy": 0.7739113718271255, + "num_tokens": 4138456.0, + "step": 143 + }, + { + "entropy": 1.085595116019249, + "epoch": 1.7142857142857144, + "grad_norm": 0.21875, + "learning_rate": 7.896501156545044e-06, + "loss": 0.7860180735588074, + "mean_token_accuracy": 0.7786939144134521, + "num_tokens": 4168706.0, + "step": 144 + }, + { + "entropy": 1.072287455201149, + "epoch": 1.7261904761904763, + "grad_norm": 0.2197265625, + "learning_rate": 7.774790660436857e-06, + "loss": 0.7674008011817932, + "mean_token_accuracy": 0.7878812775015831, + "num_tokens": 4197229.0, + "step": 145 + }, + { + "entropy": 1.1147316098213196, + "epoch": 1.7380952380952381, + "grad_norm": 0.21484375, + "learning_rate": 7.653425995322852e-06, + "loss": 0.8494656682014465, + "mean_token_accuracy": 0.7613426074385643, + "num_tokens": 4226241.0, + "step": 146 + }, + { + "entropy": 
1.0884745866060257, + "epoch": 1.75, + "grad_norm": 0.2353515625, + "learning_rate": 7.532426023097063e-06, + "loss": 0.7670794129371643, + "mean_token_accuracy": 0.7854177579283714, + "num_tokens": 4254275.0, + "step": 147 + }, + { + "entropy": 1.0930557996034622, + "epoch": 1.7619047619047619, + "grad_norm": 0.208984375, + "learning_rate": 7.411809548974792e-06, + "loss": 0.8160566091537476, + "mean_token_accuracy": 0.7741377875208855, + "num_tokens": 4283150.0, + "step": 148 + }, + { + "entropy": 1.0953784435987473, + "epoch": 1.7738095238095237, + "grad_norm": 0.2236328125, + "learning_rate": 7.291595318569951e-06, + "loss": 0.8185824751853943, + "mean_token_accuracy": 0.7722392901778221, + "num_tokens": 4312410.0, + "step": 149 + }, + { + "entropy": 1.0579064786434174, + "epoch": 1.7857142857142856, + "grad_norm": 0.2099609375, + "learning_rate": 7.171802014981726e-06, + "loss": 0.748650848865509, + "mean_token_accuracy": 0.79205472022295, + "num_tokens": 4341254.0, + "step": 150 + }, + { + "entropy": 1.0844353437423706, + "epoch": 1.7976190476190477, + "grad_norm": 0.2197265625, + "learning_rate": 7.052448255890958e-06, + "loss": 0.7923084497451782, + "mean_token_accuracy": 0.7777365446090698, + "num_tokens": 4369573.0, + "step": 151 + }, + { + "entropy": 1.098150685429573, + "epoch": 1.8095238095238095, + "grad_norm": 0.23046875, + "learning_rate": 6.933552590666659e-06, + "loss": 0.8330479860305786, + "mean_token_accuracy": 0.7675687223672867, + "num_tokens": 4397753.0, + "step": 152 + }, + { + "entropy": 1.0822398364543915, + "epoch": 1.8214285714285714, + "grad_norm": 0.2119140625, + "learning_rate": 6.815133497483157e-06, + "loss": 0.7889379262924194, + "mean_token_accuracy": 0.7777184247970581, + "num_tokens": 4427257.0, + "step": 153 + }, + { + "entropy": 1.0754519402980804, + "epoch": 1.8333333333333335, + "grad_norm": 0.2236328125, + "learning_rate": 6.697209380448333e-06, + "loss": 0.784767746925354, + "mean_token_accuracy": 0.7786002233624458, + 
"num_tokens": 4456709.0, + "step": 154 + }, + { + "entropy": 1.084225744009018, + "epoch": 1.8452380952380953, + "grad_norm": 0.2177734375, + "learning_rate": 6.579798566743314e-06, + "loss": 0.8074082732200623, + "mean_token_accuracy": 0.7725819870829582, + "num_tokens": 4485653.0, + "step": 155 + }, + { + "entropy": 1.0734328627586365, + "epoch": 1.8571428571428572, + "grad_norm": 0.203125, + "learning_rate": 6.462919303774186e-06, + "loss": 0.7693166136741638, + "mean_token_accuracy": 0.7844012156128883, + "num_tokens": 4515131.0, + "step": 156 + }, + { + "entropy": 1.0944669842720032, + "epoch": 1.869047619047619, + "grad_norm": 0.224609375, + "learning_rate": 6.34658975633605e-06, + "loss": 0.8283172249794006, + "mean_token_accuracy": 0.7699304968118668, + "num_tokens": 4544554.0, + "step": 157 + }, + { + "entropy": 1.0710095912218094, + "epoch": 1.880952380952381, + "grad_norm": 0.20703125, + "learning_rate": 6.230828003789949e-06, + "loss": 0.7723422050476074, + "mean_token_accuracy": 0.785270169377327, + "num_tokens": 4574526.0, + "step": 158 + }, + { + "entropy": 1.1019357591867447, + "epoch": 1.8928571428571428, + "grad_norm": 0.2236328125, + "learning_rate": 6.115652037253054e-06, + "loss": 0.842171847820282, + "mean_token_accuracy": 0.7660646587610245, + "num_tokens": 4603221.0, + "step": 159 + }, + { + "entropy": 1.0798636227846146, + "epoch": 1.9047619047619047, + "grad_norm": 0.2138671875, + "learning_rate": 6.001079756802592e-06, + "loss": 0.7799994945526123, + "mean_token_accuracy": 0.7830873727798462, + "num_tokens": 4632086.0, + "step": 160 + }, + { + "entropy": 1.0717933773994446, + "epoch": 1.9166666666666665, + "grad_norm": 0.21875, + "learning_rate": 5.887128968693887e-06, + "loss": 0.7654195427894592, + "mean_token_accuracy": 0.7824560701847076, + "num_tokens": 4660316.0, + "step": 161 + }, + { + "entropy": 1.069181576371193, + "epoch": 1.9285714285714286, + "grad_norm": 0.2119140625, + "learning_rate": 5.773817382593008e-06, + "loss": 
0.7898048162460327, + "mean_token_accuracy": 0.7769228145480156, + "num_tokens": 4689587.0, + "step": 162 + }, + { + "entropy": 1.0788903683423996, + "epoch": 1.9404761904761905, + "grad_norm": 0.212890625, + "learning_rate": 5.66116260882442e-06, + "loss": 0.7974780797958374, + "mean_token_accuracy": 0.7752274572849274, + "num_tokens": 4719335.0, + "step": 163 + }, + { + "entropy": 1.1007077991962433, + "epoch": 1.9523809523809523, + "grad_norm": 0.23046875, + "learning_rate": 5.549182155634076e-06, + "loss": 0.7892836332321167, + "mean_token_accuracy": 0.7779370620846748, + "num_tokens": 4746463.0, + "step": 164 + }, + { + "entropy": 1.0850374549627304, + "epoch": 1.9642857142857144, + "grad_norm": 0.2109375, + "learning_rate": 5.43789342646837e-06, + "loss": 0.7919931411743164, + "mean_token_accuracy": 0.7770635932683945, + "num_tokens": 4775141.0, + "step": 165 + }, + { + "entropy": 1.081614837050438, + "epoch": 1.9761904761904763, + "grad_norm": 0.21875, + "learning_rate": 5.32731371726938e-06, + "loss": 0.7762281894683838, + "mean_token_accuracy": 0.7814824879169464, + "num_tokens": 4803229.0, + "step": 166 + }, + { + "entropy": 1.086833968758583, + "epoch": 1.9880952380952381, + "grad_norm": 0.216796875, + "learning_rate": 5.217460213786822e-06, + "loss": 0.8244621157646179, + "mean_token_accuracy": 0.7730955481529236, + "num_tokens": 4832506.0, + "step": 167 + }, + { + "entropy": 1.0700944513082504, + "epoch": 2.0, + "grad_norm": 0.2216796875, + "learning_rate": 5.108349988907111e-06, + "loss": 0.7783507704734802, + "mean_token_accuracy": 0.7822717130184174, + "num_tokens": 4860038.0, + "step": 168 + }, + { + "epoch": 2.0, + "eval_entropy": 1.0883367625872293, + "eval_loss": 0.8387430906295776, + "eval_mean_token_accuracy": 0.7681301248073578, + "eval_model_preparation_time": 0.0051, + "eval_num_tokens": 4860038.0, + "eval_runtime": 19.5881, + "eval_samples_per_second": 7.658, + "eval_steps_per_second": 7.658, + "step": 168 + } + ], + "logging_steps": 1, + 
"max_steps": 252, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1131690390237286e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..a65364f --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df04f2387ceaaf0af4f50c3c27439b4b3b5bb4a366490e82fbcb5ddc98d615ef +size 5649