初始化项目，由ModelHub XC社区提供模型

Model: mremila/Llama-3.1-8B-coding Source: Original Platform
2026-04-25 21:02:20 +08:00
commit 39e6955dd1
25 changed files with 3126 additions and 0 deletions
--- a/checkpoint-556/chat_template.jinja
+++ b/checkpoint-556/chat_template.jinja
@@ -0,0 +1,109 @@
+{{- bos_token }}{#- Emit the BOS token first; the statements below fill in defaults for optional template variables. #}
+{%- if custom_tools is defined %}{#- A caller-supplied custom_tools replaces tools. #}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}{#- Default: tool definitions are rendered in the first user turn rather than in the system turn. #}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}{#- Fallback value for the "Today Date" line when the caller passes no date_string. #}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}{#- Normalise a missing tools variable to none so the later none checks can be used. #}
+    {%- set tools = none %}
+{%- endif %}
+
+{#- Pull a leading system message out of messages (content trimmed) and drop it from the list so the main loop does not render it again; without one, the system text is empty. NOTE(review): messages[0] is indexed without checking that messages is non-empty; confirm callers never pass an empty list. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System turn (always emitted): header, optional tool environment lines, the fixed "Cutting Knowledge Date" line, the "Today Date" line, optional JSON tool definitions, then the extracted system message and the end-of-turn token. #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}{#- Announce the ipython environment whenever any kind of tool is available. #}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}{#- List the builtin tool names, comma-separated, leaving out code_interpreter. #}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}{#- Only when tools belong in the system turn: calling instructions followed by each tool as indented JSON. #}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance: when tools_in_user_message is set and tools is not none, one user turn is emitted holding the calling instructions, each tool as indented JSON, and the content of the first remaining message. #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first remaining message (content trimmed) and remove it from the list so we can plug it in here. NOTE(review): its role is not checked; presumably it is the first user message, confirm against callers. #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}{#- Closes the messages-length check above; the indentation is off but the nesting is as intended. #}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}{#- Render every remaining message; three cases: plain message, assistant tool call, tool/ipython result. #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}{#- Plain message: role header, trimmed content, end-of-turn token. #}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}{#- Assistant tool call: exactly one call per message is accepted, anything else raises. #}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}{#- Builtin tool: written after the python tag as name.call(arg="value", ...). #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}{#- NOTE(review): plain string concatenation; a non-string arg_val would presumably raise here, confirm builtin tool arguments are always strings. #}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}{#- Any other tool: written as a JSON object with the tool name and its arguments under "parameters". #}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- builtin_tools being defined means we're in ipython mode: the tool-call turn ends with the end-of-message token instead of the end-of-turn token. #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}{#- Tool result: always rendered under the ipython header, whichever of the two roles was given. #}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}{#- NOTE(review): the iterable test is also true for plain strings, so string content takes the tojson branch too; confirm this is intended. #}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- When requested, end the prompt with an opened assistant header and no end-of-turn token. #}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
--- a/checkpoint-556/config.json
+++ b/checkpoint-556/config.json
@@ -0,0 +1,36 @@
+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "dtype": "float32",
+  "eos_token_id": 128009,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pad_token_id": 128009,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_parameters": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_theta": 500000.0,
+    "rope_type": "llama3"
+  },
+  "tie_word_embeddings": false,
+  "transformers_version": "5.3.0",
+  "use_cache": false,
+  "vocab_size": 128256
+}
--- a/checkpoint-556/generation_config.json
+++ b/checkpoint-556/generation_config.json
@@ -0,0 +1,13 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "do_sample": true,
+  "eos_token_id": [
+    128009,
+    128001
+  ],
+  "pad_token_id": 128009,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "5.3.0"
+}
--- a/checkpoint-556/model.safetensors
+++ b/checkpoint-556/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93b11399090fa2a1cdd668d929bf43250eadc83605451a6acc342987cca076bf
+size 32121079032
--- a/checkpoint-556/optimizer.bin
+++ b/checkpoint-556/optimizer.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09f21324b98a9e73906291c7f1387bce209d66fceae76985a053dcee2dcdf022
+size 64242369179
--- a/checkpoint-556/pytorch_model_fsdp.bin
+++ b/checkpoint-556/pytorch_model_fsdp.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e63c7c50ab8cc828f2ec1b1acd0b7537f5df2413f04d24aab3570df776c33d8
+size 32121192148
--- a/checkpoint-556/rng_state_0.pth
+++ b/checkpoint-556/rng_state_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:659b1cdee2219458dd84ce6a632a595465680b8080e5c44bd600ff97eca8d752
+size 15429
--- a/checkpoint-556/rng_state_1.pth
+++ b/checkpoint-556/rng_state_1.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86accf27064cdd503053e90476a6bd10de333d4ff0594535ad55ea13a473c91d
+size 15429
--- a/checkpoint-556/rng_state_2.pth
+++ b/checkpoint-556/rng_state_2.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ca8d714ef40be035404c1957b5a4dee84e1f43980408393f8aa710552ee6f6
+size 15429
--- a/checkpoint-556/rng_state_3.pth
+++ b/checkpoint-556/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cfdebe99e40accc9c9d8f09c63136a14abda997d9b501969ec8e16e9d183179
+size 15429
--- a/checkpoint-556/scheduler.pt
+++ b/checkpoint-556/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56935b15158c411b6c1d01c5776da2ee31f3bd7a9c997f7e81dcc87106ce1abc
+size 1465
--- a/checkpoint-556/tokenizer.json
+++ b/checkpoint-556/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
--- a/checkpoint-556/tokenizer_config.json
+++ b/checkpoint-556/tokenizer_config.json
@@ -0,0 +1,14 @@
+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}
--- a/checkpoint-556/trainer_state.json
+++ b/checkpoint-556/trainer_state.json
@@ -0,0 +1,584 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 556,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.4901377972215415,
+      "epoch": 0.017992690469496767,
+      "grad_norm": 2.2936885356903076,
+      "learning_rate": 2.647058823529412e-06,
+      "loss": 1.6211814880371094,
+      "mean_token_accuracy": 0.6704532062634826,
+      "num_tokens": 1320132.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.276751457154751,
+      "epoch": 0.03598538093899353,
+      "grad_norm": 0.6068552732467651,
+      "learning_rate": 4.9814471243042675e-06,
+      "loss": 1.2583621978759765,
+      "mean_token_accuracy": 0.7108760349452495,
+      "num_tokens": 2628713.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.1512113714590668,
+      "epoch": 0.0539780714084903,
+      "grad_norm": 0.0513942688703537,
+      "learning_rate": 4.888682745825603e-06,
+      "loss": 1.1380729675292969,
+      "mean_token_accuracy": 0.7266728295013308,
+      "num_tokens": 3970015.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.1279350664466619,
+      "epoch": 0.07197076187798707,
+      "grad_norm": 0.054850462824106216,
+      "learning_rate": 4.795918367346939e-06,
+      "loss": 1.0975475311279297,
+      "mean_token_accuracy": 0.7291999347507954,
+      "num_tokens": 5293538.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.115758778527379,
+      "epoch": 0.08996345234748383,
+      "grad_norm": 0.04872556030750275,
+      "learning_rate": 4.7031539888682745e-06,
+      "loss": 1.0612051010131835,
+      "mean_token_accuracy": 0.7381280666217208,
+      "num_tokens": 6620811.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.1105864774435759,
+      "epoch": 0.1079561428169806,
+      "grad_norm": 0.04739998281002045,
+      "learning_rate": 4.610389610389611e-06,
+      "loss": 1.0470812797546387,
+      "mean_token_accuracy": 0.7379071025177837,
+      "num_tokens": 7936154.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.0977919282391668,
+      "epoch": 0.12594883328647738,
+      "grad_norm": 0.040797509253025055,
+      "learning_rate": 4.517625231910946e-06,
+      "loss": 1.0206071853637695,
+      "mean_token_accuracy": 0.741416247934103,
+      "num_tokens": 9258443.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.0766226774081589,
+      "epoch": 0.14394152375597413,
+      "grad_norm": 0.04117418825626373,
+      "learning_rate": 4.424860853432282e-06,
+      "loss": 1.0037202835083008,
+      "mean_token_accuracy": 0.7426602357998491,
+      "num_tokens": 10559451.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.0392444429919123,
+      "epoch": 0.1619342142254709,
+      "grad_norm": 0.03727104142308235,
+      "learning_rate": 4.332096474953618e-06,
+      "loss": 0.9694362640380859,
+      "mean_token_accuracy": 0.7481566898524761,
+      "num_tokens": 11859629.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.030888595432043,
+      "epoch": 0.17992690469496767,
+      "grad_norm": 0.0377194844186306,
+      "learning_rate": 4.239332096474954e-06,
+      "loss": 0.9774051666259765,
+      "mean_token_accuracy": 0.7471670845523477,
+      "num_tokens": 13170517.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.9908933199942112,
+      "epoch": 0.19791959516446445,
+      "grad_norm": 0.03397062420845032,
+      "learning_rate": 4.14656771799629e-06,
+      "loss": 0.9399270057678223,
+      "mean_token_accuracy": 0.7530543757602572,
+      "num_tokens": 14480196.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.9985105341300369,
+      "epoch": 0.2159122856339612,
+      "grad_norm": 0.038795359432697296,
+      "learning_rate": 4.053803339517626e-06,
+      "loss": 0.9471940994262695,
+      "mean_token_accuracy": 0.7546697033569216,
+      "num_tokens": 15807230.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.9790718862786889,
+      "epoch": 0.23390497610345798,
+      "grad_norm": 0.03815858066082001,
+      "learning_rate": 3.961038961038962e-06,
+      "loss": 0.925960922241211,
+      "mean_token_accuracy": 0.7591136118397117,
+      "num_tokens": 17157655.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.9843037761747837,
+      "epoch": 0.25189766657295476,
+      "grad_norm": 0.03516776114702225,
+      "learning_rate": 3.868274582560297e-06,
+      "loss": 0.9341155052185058,
+      "mean_token_accuracy": 0.7569106232374907,
+      "num_tokens": 18481580.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.9783661976456642,
+      "epoch": 0.2698903570424515,
+      "grad_norm": 0.034192971885204315,
+      "learning_rate": 3.7755102040816327e-06,
+      "loss": 0.918891716003418,
+      "mean_token_accuracy": 0.7582158392295242,
+      "num_tokens": 19792039.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.9883000548928976,
+      "epoch": 0.28788304751194826,
+      "grad_norm": 0.03616062551736832,
+      "learning_rate": 3.6827458256029685e-06,
+      "loss": 0.9350194931030273,
+      "mean_token_accuracy": 0.7552345667034388,
+      "num_tokens": 21132002.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.962575543858111,
+      "epoch": 0.305875737981445,
+      "grad_norm": 0.031624436378479004,
+      "learning_rate": 3.5899814471243043e-06,
+      "loss": 0.9099706649780274,
+      "mean_token_accuracy": 0.7614207146689296,
+      "num_tokens": 22456610.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.981575589068234,
+      "epoch": 0.3238684284509418,
+      "grad_norm": 0.03008902259171009,
+      "learning_rate": 3.49721706864564e-06,
+      "loss": 0.9275808334350586,
+      "mean_token_accuracy": 0.7563599238172174,
+      "num_tokens": 23784860.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.9543529843911529,
+      "epoch": 0.3418611189204386,
+      "grad_norm": 0.03235575929284096,
+      "learning_rate": 3.404452690166976e-06,
+      "loss": 0.9126798629760742,
+      "mean_token_accuracy": 0.7603292245417833,
+      "num_tokens": 25106610.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.9536242228001356,
+      "epoch": 0.35985380938993533,
+      "grad_norm": 0.033603642135858536,
+      "learning_rate": 3.311688311688312e-06,
+      "loss": 0.9094326019287109,
+      "mean_token_accuracy": 0.7603268170729279,
+      "num_tokens": 26404730.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.9402435509487986,
+      "epoch": 0.3778464998594321,
+      "grad_norm": 0.029900604858994484,
+      "learning_rate": 3.218923933209648e-06,
+      "loss": 0.8853635787963867,
+      "mean_token_accuracy": 0.7637220246717333,
+      "num_tokens": 27746430.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.9270002828910947,
+      "epoch": 0.3958391903289289,
+      "grad_norm": 0.03154909983277321,
+      "learning_rate": 3.1261595547309838e-06,
+      "loss": 0.8845057487487793,
+      "mean_token_accuracy": 0.7643253333866596,
+      "num_tokens": 29091240.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.9196253689005971,
+      "epoch": 0.41383188079842564,
+      "grad_norm": 0.028953028842806816,
+      "learning_rate": 3.0333951762523196e-06,
+      "loss": 0.880043888092041,
+      "mean_token_accuracy": 0.7643528375774622,
+      "num_tokens": 30412544.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.9138461783528328,
+      "epoch": 0.4318245712679224,
+      "grad_norm": 0.028740836307406425,
+      "learning_rate": 2.9406307977736554e-06,
+      "loss": 0.8804447174072265,
+      "mean_token_accuracy": 0.7650679206475616,
+      "num_tokens": 31721248.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.9258439548313617,
+      "epoch": 0.44981726173741915,
+      "grad_norm": 0.027906838804483414,
+      "learning_rate": 2.8478664192949912e-06,
+      "loss": 0.8891608238220214,
+      "mean_token_accuracy": 0.7623051449656486,
+      "num_tokens": 33030621.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.9231391252949834,
+      "epoch": 0.46780995220691596,
+      "grad_norm": 0.027720769867300987,
+      "learning_rate": 2.7551020408163266e-06,
+      "loss": 0.9020990371704102,
+      "mean_token_accuracy": 0.7595951380208135,
+      "num_tokens": 34328254.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.9248277079313993,
+      "epoch": 0.4858026426764127,
+      "grad_norm": 0.028005970641970634,
+      "learning_rate": 2.6623376623376624e-06,
+      "loss": 0.8968218803405762,
+      "mean_token_accuracy": 0.7620166089385748,
+      "num_tokens": 35639568.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.9164260600693523,
+      "epoch": 0.5037953331459095,
+      "grad_norm": 0.025676406919956207,
+      "learning_rate": 2.5695732838589982e-06,
+      "loss": 0.894569206237793,
+      "mean_token_accuracy": 0.7612657260149718,
+      "num_tokens": 36947904.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.9089541524648667,
+      "epoch": 0.5217880236154062,
+      "grad_norm": 0.028434382751584053,
+      "learning_rate": 2.476808905380334e-06,
+      "loss": 0.8868412017822266,
+      "mean_token_accuracy": 0.763394633680582,
+      "num_tokens": 38281521.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.9049528720788658,
+      "epoch": 0.539780714084903,
+      "grad_norm": 0.02663426101207733,
+      "learning_rate": 2.38404452690167e-06,
+      "loss": 0.8812618255615234,
+      "mean_token_accuracy": 0.7641567781567573,
+      "num_tokens": 39595803.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.900223555136472,
+      "epoch": 0.5577734045543997,
+      "grad_norm": 0.026907267048954964,
+      "learning_rate": 2.2912801484230057e-06,
+      "loss": 0.8773960113525391,
+      "mean_token_accuracy": 0.7646851245313883,
+      "num_tokens": 40918054.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.9072908268310129,
+      "epoch": 0.5757660950238965,
+      "grad_norm": 0.033084969967603683,
+      "learning_rate": 2.1985157699443415e-06,
+      "loss": 0.8849006652832031,
+      "mean_token_accuracy": 0.7633785914629698,
+      "num_tokens": 42245476.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.9075088860467077,
+      "epoch": 0.5937587854933933,
+      "grad_norm": 0.029511412605643272,
+      "learning_rate": 2.1057513914656773e-06,
+      "loss": 0.8799509048461914,
+      "mean_token_accuracy": 0.7644402593374252,
+      "num_tokens": 43592571.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.897929747030139,
+      "epoch": 0.61175147596289,
+      "grad_norm": 0.027747338637709618,
+      "learning_rate": 2.012987012987013e-06,
+      "loss": 0.8784950256347657,
+      "mean_token_accuracy": 0.7654943082481622,
+      "num_tokens": 44949762.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.8959064597263933,
+      "epoch": 0.6297441664323868,
+      "grad_norm": 0.02585972286760807,
+      "learning_rate": 1.920222634508349e-06,
+      "loss": 0.8677197456359863,
+      "mean_token_accuracy": 0.7666845623403787,
+      "num_tokens": 46266907.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.9085025515407323,
+      "epoch": 0.6477368569018837,
+      "grad_norm": 0.026946574449539185,
+      "learning_rate": 1.8274582560296848e-06,
+      "loss": 0.8925327301025391,
+      "mean_token_accuracy": 0.7623184407129884,
+      "num_tokens": 47577598.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.8742405578494072,
+      "epoch": 0.6657295473713803,
+      "grad_norm": 0.026929043233394623,
+      "learning_rate": 1.7346938775510206e-06,
+      "loss": 0.8524269104003906,
+      "mean_token_accuracy": 0.7705512259155511,
+      "num_tokens": 48888300.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.9005698974244296,
+      "epoch": 0.6837222378408772,
+      "grad_norm": 0.027014046907424927,
+      "learning_rate": 1.6419294990723564e-06,
+      "loss": 0.8712619781494141,
+      "mean_token_accuracy": 0.7643290877342224,
+      "num_tokens": 50229069.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.8819140480831266,
+      "epoch": 0.701714928310374,
+      "grad_norm": 0.028174864128232002,
+      "learning_rate": 1.5491651205936922e-06,
+      "loss": 0.8646106719970703,
+      "mean_token_accuracy": 0.7674408122897148,
+      "num_tokens": 51578947.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.8925842920318245,
+      "epoch": 0.7197076187798707,
+      "grad_norm": 0.027017617598176003,
+      "learning_rate": 1.456400742115028e-06,
+      "loss": 0.8714614868164062,
+      "mean_token_accuracy": 0.7669254776090384,
+      "num_tokens": 52930805.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.889844935759902,
+      "epoch": 0.7377003092493675,
+      "grad_norm": 0.02721812203526497,
+      "learning_rate": 1.3636363636363636e-06,
+      "loss": 0.8674912452697754,
+      "mean_token_accuracy": 0.7662461360916495,
+      "num_tokens": 54224294.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.8719520575366915,
+      "epoch": 0.7556929997188642,
+      "grad_norm": 0.028012819588184357,
+      "learning_rate": 1.2708719851576994e-06,
+      "loss": 0.8511224746704101,
+      "mean_token_accuracy": 0.7702083302661776,
+      "num_tokens": 55540584.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.8898111075162888,
+      "epoch": 0.773685690188361,
+      "grad_norm": 0.02642475627362728,
+      "learning_rate": 1.1781076066790352e-06,
+      "loss": 0.8730297088623047,
+      "mean_token_accuracy": 0.7653367448598146,
+      "num_tokens": 56827841.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.8857162812724709,
+      "epoch": 0.7916783806578578,
+      "grad_norm": 0.02740148827433586,
+      "learning_rate": 1.0853432282003713e-06,
+      "loss": 0.8713733673095703,
+      "mean_token_accuracy": 0.7659575197845697,
+      "num_tokens": 58130682.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.8843438906595111,
+      "epoch": 0.8096710711273545,
+      "grad_norm": 0.025668496266007423,
+      "learning_rate": 9.925788497217069e-07,
+      "loss": 0.8760784149169922,
+      "mean_token_accuracy": 0.7651905825361609,
+      "num_tokens": 59444140.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.876284147053957,
+      "epoch": 0.8276637615968513,
+      "grad_norm": 0.026019152253866196,
+      "learning_rate": 8.998144712430428e-07,
+      "loss": 0.8590941429138184,
+      "mean_token_accuracy": 0.7688775883987546,
+      "num_tokens": 60778522.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.8704025126062334,
+      "epoch": 0.8456564520663481,
+      "grad_norm": 0.024385536089539528,
+      "learning_rate": 8.070500927643786e-07,
+      "loss": 0.8481533050537109,
+      "mean_token_accuracy": 0.7709953064098954,
+      "num_tokens": 62138075.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.886689430475235,
+      "epoch": 0.8636491425358448,
+      "grad_norm": 0.027147600427269936,
+      "learning_rate": 7.142857142857143e-07,
+      "loss": 0.8655129432678222,
+      "mean_token_accuracy": 0.7670928187668323,
+      "num_tokens": 63450349.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.8841921042650938,
+      "epoch": 0.8816418330053416,
+      "grad_norm": 0.025846796110272408,
+      "learning_rate": 6.215213358070501e-07,
+      "loss": 0.8744302749633789,
+      "mean_token_accuracy": 0.7654220588505268,
+      "num_tokens": 64770576.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.8944361335597932,
+      "epoch": 0.8996345234748383,
+      "grad_norm": 0.025025852024555206,
+      "learning_rate": 5.287569573283859e-07,
+      "loss": 0.8789453506469727,
+      "mean_token_accuracy": 0.7639346193522215,
+      "num_tokens": 66113087.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.8843724082224071,
+      "epoch": 0.9176272139443351,
+      "grad_norm": 0.02651493437588215,
+      "learning_rate": 4.359925788497217e-07,
+      "loss": 0.8675421714782715,
+      "mean_token_accuracy": 0.7664000844582916,
+      "num_tokens": 67464302.0,
+      "step": 510
+    },
+    {
+      "entropy": 0.8899071650579572,
+      "epoch": 0.9356199044138319,
+      "grad_norm": 0.025058092549443245,
+      "learning_rate": 3.4322820037105757e-07,
+      "loss": 0.879638385772705,
+      "mean_token_accuracy": 0.7650359075516462,
+      "num_tokens": 68809443.0,
+      "step": 520
+    },
+    {
+      "entropy": 0.8678001549094916,
+      "epoch": 0.9536125948833286,
+      "grad_norm": 0.025574836879968643,
+      "learning_rate": 2.5046382189239333e-07,
+      "loss": 0.8517162322998046,
+      "mean_token_accuracy": 0.7706384485587477,
+      "num_tokens": 70130884.0,
+      "step": 530
+    },
+    {
+      "entropy": 0.8980348063632846,
+      "epoch": 0.9716052853528254,
+      "grad_norm": 0.02690030448138714,
+      "learning_rate": 1.5769944341372915e-07,
+      "loss": 0.8926727294921875,
+      "mean_token_accuracy": 0.7621918022632599,
+      "num_tokens": 71446103.0,
+      "step": 540
+    },
+    {
+      "entropy": 0.8809462685137988,
+      "epoch": 0.9895979758223222,
+      "grad_norm": 0.02480347640812397,
+      "learning_rate": 6.493506493506495e-08,
+      "loss": 0.8590832710266113,
+      "mean_token_accuracy": 0.7687337175011635,
+      "num_tokens": 72793622.0,
+      "step": 550
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 556,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.0599989240114708e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}
--- a/checkpoint-556/training_args.bin
+++ b/checkpoint-556/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7238b353eb99d116ecb084fb9fed131ddd6214e745694796c3a48165bc8ba1a9
+size 6033