初始化项目,由ModelHub XC社区提供模型
Model: mremila/Llama-3.1-8B-coding Source: Original Platform
This commit is contained in:
109
checkpoint-556/chat_template.jinja
Normal file
109
checkpoint-556/chat_template.jinja
Normal file
@@ -0,0 +1,109 @@
|
||||
{{- bos_token }}
|
||||
{%- if custom_tools is defined %}
|
||||
{%- set tools = custom_tools %}
|
||||
{%- endif %}
|
||||
{%- if not tools_in_user_message is defined %}
|
||||
{%- set tools_in_user_message = true %}
|
||||
{%- endif %}
|
||||
{%- if not date_string is defined %}
|
||||
{%- set date_string = "26 Jul 2024" %}
|
||||
{%- endif %}
|
||||
{%- if not tools is defined %}
|
||||
{%- set tools = none %}
|
||||
{%- endif %}
|
||||
|
||||
{#- This block extracts the system message, so we can slot it into the right place. #}
|
||||
{%- if messages[0]['role'] == 'system' %}
|
||||
{%- set system_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{%- set system_message = "" %}
|
||||
{%- endif %}
|
||||
|
||||
{#- System message + builtin tools #}
|
||||
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
|
||||
{%- if builtin_tools is defined or tools is not none %}
|
||||
{{- "Environment: ipython\n" }}
|
||||
{%- endif %}
|
||||
{%- if builtin_tools is defined %}
|
||||
{{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
|
||||
{%- endif %}
|
||||
{{- "Cutting Knowledge Date: December 2023\n" }}
|
||||
{{- "Today Date: " + date_string + "\n\n" }}
|
||||
{%- if tools is not none and not tools_in_user_message %}
|
||||
{{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- system_message }}
|
||||
{{- "<|eot_id|>" }}
|
||||
|
||||
{#- Custom tools are passed in a user message with some extra guidance #}
|
||||
{%- if tools_in_user_message and not tools is none %}
|
||||
{#- Extract the first user message so we can plug it in here #}
|
||||
{%- if messages | length != 0 %}
|
||||
{%- set first_user_message = messages[0]['content']|trim %}
|
||||
{%- set messages = messages[1:] %}
|
||||
{%- else %}
|
||||
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
|
||||
{%- endif %}
|
||||
{{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
|
||||
{{- "Given the following functions, please respond with a JSON for a function call " }}
|
||||
{{- "with its proper arguments that best answers the given prompt.\n\n" }}
|
||||
{{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
|
||||
{{- "Do not use variables.\n\n" }}
|
||||
{%- for t in tools %}
|
||||
{{- t | tojson(indent=4) }}
|
||||
{{- "\n\n" }}
|
||||
{%- endfor %}
|
||||
{{- first_user_message + "<|eot_id|>"}}
|
||||
{%- endif %}
|
||||
|
||||
{%- for message in messages %}
|
||||
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
|
||||
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
|
||||
{%- elif 'tool_calls' in message %}
|
||||
{%- if not message.tool_calls|length == 1 %}
|
||||
{{- raise_exception("This model only supports single tool-calls at once!") }}
|
||||
{%- endif %}
|
||||
{%- set tool_call = message.tool_calls[0].function %}
|
||||
{%- if builtin_tools is defined and tool_call.name in builtin_tools %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
|
||||
{{- "<|python_tag|>" + tool_call.name + ".call(" }}
|
||||
{%- for arg_name, arg_val in tool_call.arguments | items %}
|
||||
{{- arg_name + '="' + arg_val + '"' }}
|
||||
{%- if not loop.last %}
|
||||
{{- ", " }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{{- ")" }}
|
||||
{%- else %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
|
||||
{{- '{"name": "' + tool_call.name + '", ' }}
|
||||
{{- '"parameters": ' }}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{{- "}" }}
|
||||
{%- endif %}
|
||||
{%- if builtin_tools is defined %}
|
||||
{#- This means we're in ipython mode #}
|
||||
{{- "<|eom_id|>" }}
|
||||
{%- else %}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- endif %}
|
||||
{%- elif message.role == "tool" or message.role == "ipython" %}
|
||||
{{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
|
||||
{%- if message.content is mapping or message.content is iterable %}
|
||||
{{- message.content | tojson }}
|
||||
{%- else %}
|
||||
{{- message.content }}
|
||||
{%- endif %}
|
||||
{{- "<|eot_id|>" }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
||||
{%- endif %}
|
||||
36
checkpoint-556/config.json
Normal file
36
checkpoint-556/config.json
Normal file
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"architectures": [
|
||||
"LlamaForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": 128000,
|
||||
"dtype": "float32",
|
||||
"eos_token_id": 128009,
|
||||
"head_dim": 128,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 4096,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 14336,
|
||||
"max_position_embeddings": 131072,
|
||||
"mlp_bias": false,
|
||||
"model_type": "llama",
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 32,
|
||||
"num_key_value_heads": 8,
|
||||
"pad_token_id": 128009,
|
||||
"pretraining_tp": 1,
|
||||
"rms_norm_eps": 1e-05,
|
||||
"rope_parameters": {
|
||||
"factor": 8.0,
|
||||
"high_freq_factor": 4.0,
|
||||
"low_freq_factor": 1.0,
|
||||
"original_max_position_embeddings": 8192,
|
||||
"rope_theta": 500000.0,
|
||||
"rope_type": "llama3"
|
||||
},
|
||||
"tie_word_embeddings": false,
|
||||
"transformers_version": "5.3.0",
|
||||
"use_cache": false,
|
||||
"vocab_size": 128256
|
||||
}
|
||||
13
checkpoint-556/generation_config.json
Normal file
13
checkpoint-556/generation_config.json
Normal file
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"_from_model_config": true,
|
||||
"bos_token_id": 128000,
|
||||
"do_sample": true,
|
||||
"eos_token_id": [
|
||||
128009,
|
||||
128001
|
||||
],
|
||||
"pad_token_id": 128009,
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.9,
|
||||
"transformers_version": "5.3.0"
|
||||
}
|
||||
3
checkpoint-556/model.safetensors
Normal file
3
checkpoint-556/model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:93b11399090fa2a1cdd668d929bf43250eadc83605451a6acc342987cca076bf
|
||||
size 32121079032
|
||||
3
checkpoint-556/optimizer.bin
Normal file
3
checkpoint-556/optimizer.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:09f21324b98a9e73906291c7f1387bce209d66fceae76985a053dcee2dcdf022
|
||||
size 64242369179
|
||||
3
checkpoint-556/pytorch_model_fsdp.bin
Normal file
3
checkpoint-556/pytorch_model_fsdp.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:0e63c7c50ab8cc828f2ec1b1acd0b7537f5df2413f04d24aab3570df776c33d8
|
||||
size 32121192148
|
||||
3
checkpoint-556/rng_state_0.pth
Normal file
3
checkpoint-556/rng_state_0.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:659b1cdee2219458dd84ce6a632a595465680b8080e5c44bd600ff97eca8d752
|
||||
size 15429
|
||||
3
checkpoint-556/rng_state_1.pth
Normal file
3
checkpoint-556/rng_state_1.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:86accf27064cdd503053e90476a6bd10de333d4ff0594535ad55ea13a473c91d
|
||||
size 15429
|
||||
3
checkpoint-556/rng_state_2.pth
Normal file
3
checkpoint-556/rng_state_2.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:18ca8d714ef40be035404c1957b5a4dee84e1f43980408393f8aa710552ee6f6
|
||||
size 15429
|
||||
3
checkpoint-556/rng_state_3.pth
Normal file
3
checkpoint-556/rng_state_3.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2cfdebe99e40accc9c9d8f09c63136a14abda997d9b501969ec8e16e9d183179
|
||||
size 15429
|
||||
3
checkpoint-556/scheduler.pt
Normal file
3
checkpoint-556/scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:56935b15158c411b6c1d01c5776da2ee31f3bd7a9c997f7e81dcc87106ce1abc
|
||||
size 1465
|
||||
3
checkpoint-556/tokenizer.json
Normal file
3
checkpoint-556/tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
|
||||
size 17209920
|
||||
14
checkpoint-556/tokenizer_config.json
Normal file
14
checkpoint-556/tokenizer_config.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"backend": "tokenizers",
|
||||
"bos_token": "<|begin_of_text|>",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"eos_token": "<|eot_id|>",
|
||||
"is_local": false,
|
||||
"model_input_names": [
|
||||
"input_ids",
|
||||
"attention_mask"
|
||||
],
|
||||
"model_max_length": 131072,
|
||||
"pad_token": "<|eot_id|>",
|
||||
"tokenizer_class": "TokenizersBackend"
|
||||
}
|
||||
584
checkpoint-556/trainer_state.json
Normal file
584
checkpoint-556/trainer_state.json
Normal file
@@ -0,0 +1,584 @@
|
||||
{
|
||||
"best_global_step": null,
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 1.0,
|
||||
"eval_steps": 500,
|
||||
"global_step": 556,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"entropy": 1.4901377972215415,
|
||||
"epoch": 0.017992690469496767,
|
||||
"grad_norm": 2.2936885356903076,
|
||||
"learning_rate": 2.647058823529412e-06,
|
||||
"loss": 1.6211814880371094,
|
||||
"mean_token_accuracy": 0.6704532062634826,
|
||||
"num_tokens": 1320132.0,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"entropy": 1.276751457154751,
|
||||
"epoch": 0.03598538093899353,
|
||||
"grad_norm": 0.6068552732467651,
|
||||
"learning_rate": 4.9814471243042675e-06,
|
||||
"loss": 1.2583621978759765,
|
||||
"mean_token_accuracy": 0.7108760349452495,
|
||||
"num_tokens": 2628713.0,
|
||||
"step": 20
|
||||
},
|
||||
{
|
||||
"entropy": 1.1512113714590668,
|
||||
"epoch": 0.0539780714084903,
|
||||
"grad_norm": 0.0513942688703537,
|
||||
"learning_rate": 4.888682745825603e-06,
|
||||
"loss": 1.1380729675292969,
|
||||
"mean_token_accuracy": 0.7266728295013308,
|
||||
"num_tokens": 3970015.0,
|
||||
"step": 30
|
||||
},
|
||||
{
|
||||
"entropy": 1.1279350664466619,
|
||||
"epoch": 0.07197076187798707,
|
||||
"grad_norm": 0.054850462824106216,
|
||||
"learning_rate": 4.795918367346939e-06,
|
||||
"loss": 1.0975475311279297,
|
||||
"mean_token_accuracy": 0.7291999347507954,
|
||||
"num_tokens": 5293538.0,
|
||||
"step": 40
|
||||
},
|
||||
{
|
||||
"entropy": 1.115758778527379,
|
||||
"epoch": 0.08996345234748383,
|
||||
"grad_norm": 0.04872556030750275,
|
||||
"learning_rate": 4.7031539888682745e-06,
|
||||
"loss": 1.0612051010131835,
|
||||
"mean_token_accuracy": 0.7381280666217208,
|
||||
"num_tokens": 6620811.0,
|
||||
"step": 50
|
||||
},
|
||||
{
|
||||
"entropy": 1.1105864774435759,
|
||||
"epoch": 0.1079561428169806,
|
||||
"grad_norm": 0.04739998281002045,
|
||||
"learning_rate": 4.610389610389611e-06,
|
||||
"loss": 1.0470812797546387,
|
||||
"mean_token_accuracy": 0.7379071025177837,
|
||||
"num_tokens": 7936154.0,
|
||||
"step": 60
|
||||
},
|
||||
{
|
||||
"entropy": 1.0977919282391668,
|
||||
"epoch": 0.12594883328647738,
|
||||
"grad_norm": 0.040797509253025055,
|
||||
"learning_rate": 4.517625231910946e-06,
|
||||
"loss": 1.0206071853637695,
|
||||
"mean_token_accuracy": 0.741416247934103,
|
||||
"num_tokens": 9258443.0,
|
||||
"step": 70
|
||||
},
|
||||
{
|
||||
"entropy": 1.0766226774081589,
|
||||
"epoch": 0.14394152375597413,
|
||||
"grad_norm": 0.04117418825626373,
|
||||
"learning_rate": 4.424860853432282e-06,
|
||||
"loss": 1.0037202835083008,
|
||||
"mean_token_accuracy": 0.7426602357998491,
|
||||
"num_tokens": 10559451.0,
|
||||
"step": 80
|
||||
},
|
||||
{
|
||||
"entropy": 1.0392444429919123,
|
||||
"epoch": 0.1619342142254709,
|
||||
"grad_norm": 0.03727104142308235,
|
||||
"learning_rate": 4.332096474953618e-06,
|
||||
"loss": 0.9694362640380859,
|
||||
"mean_token_accuracy": 0.7481566898524761,
|
||||
"num_tokens": 11859629.0,
|
||||
"step": 90
|
||||
},
|
||||
{
|
||||
"entropy": 1.030888595432043,
|
||||
"epoch": 0.17992690469496767,
|
||||
"grad_norm": 0.0377194844186306,
|
||||
"learning_rate": 4.239332096474954e-06,
|
||||
"loss": 0.9774051666259765,
|
||||
"mean_token_accuracy": 0.7471670845523477,
|
||||
"num_tokens": 13170517.0,
|
||||
"step": 100
|
||||
},
|
||||
{
|
||||
"entropy": 0.9908933199942112,
|
||||
"epoch": 0.19791959516446445,
|
||||
"grad_norm": 0.03397062420845032,
|
||||
"learning_rate": 4.14656771799629e-06,
|
||||
"loss": 0.9399270057678223,
|
||||
"mean_token_accuracy": 0.7530543757602572,
|
||||
"num_tokens": 14480196.0,
|
||||
"step": 110
|
||||
},
|
||||
{
|
||||
"entropy": 0.9985105341300369,
|
||||
"epoch": 0.2159122856339612,
|
||||
"grad_norm": 0.038795359432697296,
|
||||
"learning_rate": 4.053803339517626e-06,
|
||||
"loss": 0.9471940994262695,
|
||||
"mean_token_accuracy": 0.7546697033569216,
|
||||
"num_tokens": 15807230.0,
|
||||
"step": 120
|
||||
},
|
||||
{
|
||||
"entropy": 0.9790718862786889,
|
||||
"epoch": 0.23390497610345798,
|
||||
"grad_norm": 0.03815858066082001,
|
||||
"learning_rate": 3.961038961038962e-06,
|
||||
"loss": 0.925960922241211,
|
||||
"mean_token_accuracy": 0.7591136118397117,
|
||||
"num_tokens": 17157655.0,
|
||||
"step": 130
|
||||
},
|
||||
{
|
||||
"entropy": 0.9843037761747837,
|
||||
"epoch": 0.25189766657295476,
|
||||
"grad_norm": 0.03516776114702225,
|
||||
"learning_rate": 3.868274582560297e-06,
|
||||
"loss": 0.9341155052185058,
|
||||
"mean_token_accuracy": 0.7569106232374907,
|
||||
"num_tokens": 18481580.0,
|
||||
"step": 140
|
||||
},
|
||||
{
|
||||
"entropy": 0.9783661976456642,
|
||||
"epoch": 0.2698903570424515,
|
||||
"grad_norm": 0.034192971885204315,
|
||||
"learning_rate": 3.7755102040816327e-06,
|
||||
"loss": 0.918891716003418,
|
||||
"mean_token_accuracy": 0.7582158392295242,
|
||||
"num_tokens": 19792039.0,
|
||||
"step": 150
|
||||
},
|
||||
{
|
||||
"entropy": 0.9883000548928976,
|
||||
"epoch": 0.28788304751194826,
|
||||
"grad_norm": 0.03616062551736832,
|
||||
"learning_rate": 3.6827458256029685e-06,
|
||||
"loss": 0.9350194931030273,
|
||||
"mean_token_accuracy": 0.7552345667034388,
|
||||
"num_tokens": 21132002.0,
|
||||
"step": 160
|
||||
},
|
||||
{
|
||||
"entropy": 0.962575543858111,
|
||||
"epoch": 0.305875737981445,
|
||||
"grad_norm": 0.031624436378479004,
|
||||
"learning_rate": 3.5899814471243043e-06,
|
||||
"loss": 0.9099706649780274,
|
||||
"mean_token_accuracy": 0.7614207146689296,
|
||||
"num_tokens": 22456610.0,
|
||||
"step": 170
|
||||
},
|
||||
{
|
||||
"entropy": 0.981575589068234,
|
||||
"epoch": 0.3238684284509418,
|
||||
"grad_norm": 0.03008902259171009,
|
||||
"learning_rate": 3.49721706864564e-06,
|
||||
"loss": 0.9275808334350586,
|
||||
"mean_token_accuracy": 0.7563599238172174,
|
||||
"num_tokens": 23784860.0,
|
||||
"step": 180
|
||||
},
|
||||
{
|
||||
"entropy": 0.9543529843911529,
|
||||
"epoch": 0.3418611189204386,
|
||||
"grad_norm": 0.03235575929284096,
|
||||
"learning_rate": 3.404452690166976e-06,
|
||||
"loss": 0.9126798629760742,
|
||||
"mean_token_accuracy": 0.7603292245417833,
|
||||
"num_tokens": 25106610.0,
|
||||
"step": 190
|
||||
},
|
||||
{
|
||||
"entropy": 0.9536242228001356,
|
||||
"epoch": 0.35985380938993533,
|
||||
"grad_norm": 0.033603642135858536,
|
||||
"learning_rate": 3.311688311688312e-06,
|
||||
"loss": 0.9094326019287109,
|
||||
"mean_token_accuracy": 0.7603268170729279,
|
||||
"num_tokens": 26404730.0,
|
||||
"step": 200
|
||||
},
|
||||
{
|
||||
"entropy": 0.9402435509487986,
|
||||
"epoch": 0.3778464998594321,
|
||||
"grad_norm": 0.029900604858994484,
|
||||
"learning_rate": 3.218923933209648e-06,
|
||||
"loss": 0.8853635787963867,
|
||||
"mean_token_accuracy": 0.7637220246717333,
|
||||
"num_tokens": 27746430.0,
|
||||
"step": 210
|
||||
},
|
||||
{
|
||||
"entropy": 0.9270002828910947,
|
||||
"epoch": 0.3958391903289289,
|
||||
"grad_norm": 0.03154909983277321,
|
||||
"learning_rate": 3.1261595547309838e-06,
|
||||
"loss": 0.8845057487487793,
|
||||
"mean_token_accuracy": 0.7643253333866596,
|
||||
"num_tokens": 29091240.0,
|
||||
"step": 220
|
||||
},
|
||||
{
|
||||
"entropy": 0.9196253689005971,
|
||||
"epoch": 0.41383188079842564,
|
||||
"grad_norm": 0.028953028842806816,
|
||||
"learning_rate": 3.0333951762523196e-06,
|
||||
"loss": 0.880043888092041,
|
||||
"mean_token_accuracy": 0.7643528375774622,
|
||||
"num_tokens": 30412544.0,
|
||||
"step": 230
|
||||
},
|
||||
{
|
||||
"entropy": 0.9138461783528328,
|
||||
"epoch": 0.4318245712679224,
|
||||
"grad_norm": 0.028740836307406425,
|
||||
"learning_rate": 2.9406307977736554e-06,
|
||||
"loss": 0.8804447174072265,
|
||||
"mean_token_accuracy": 0.7650679206475616,
|
||||
"num_tokens": 31721248.0,
|
||||
"step": 240
|
||||
},
|
||||
{
|
||||
"entropy": 0.9258439548313617,
|
||||
"epoch": 0.44981726173741915,
|
||||
"grad_norm": 0.027906838804483414,
|
||||
"learning_rate": 2.8478664192949912e-06,
|
||||
"loss": 0.8891608238220214,
|
||||
"mean_token_accuracy": 0.7623051449656486,
|
||||
"num_tokens": 33030621.0,
|
||||
"step": 250
|
||||
},
|
||||
{
|
||||
"entropy": 0.9231391252949834,
|
||||
"epoch": 0.46780995220691596,
|
||||
"grad_norm": 0.027720769867300987,
|
||||
"learning_rate": 2.7551020408163266e-06,
|
||||
"loss": 0.9020990371704102,
|
||||
"mean_token_accuracy": 0.7595951380208135,
|
||||
"num_tokens": 34328254.0,
|
||||
"step": 260
|
||||
},
|
||||
{
|
||||
"entropy": 0.9248277079313993,
|
||||
"epoch": 0.4858026426764127,
|
||||
"grad_norm": 0.028005970641970634,
|
||||
"learning_rate": 2.6623376623376624e-06,
|
||||
"loss": 0.8968218803405762,
|
||||
"mean_token_accuracy": 0.7620166089385748,
|
||||
"num_tokens": 35639568.0,
|
||||
"step": 270
|
||||
},
|
||||
{
|
||||
"entropy": 0.9164260600693523,
|
||||
"epoch": 0.5037953331459095,
|
||||
"grad_norm": 0.025676406919956207,
|
||||
"learning_rate": 2.5695732838589982e-06,
|
||||
"loss": 0.894569206237793,
|
||||
"mean_token_accuracy": 0.7612657260149718,
|
||||
"num_tokens": 36947904.0,
|
||||
"step": 280
|
||||
},
|
||||
{
|
||||
"entropy": 0.9089541524648667,
|
||||
"epoch": 0.5217880236154062,
|
||||
"grad_norm": 0.028434382751584053,
|
||||
"learning_rate": 2.476808905380334e-06,
|
||||
"loss": 0.8868412017822266,
|
||||
"mean_token_accuracy": 0.763394633680582,
|
||||
"num_tokens": 38281521.0,
|
||||
"step": 290
|
||||
},
|
||||
{
|
||||
"entropy": 0.9049528720788658,
|
||||
"epoch": 0.539780714084903,
|
||||
"grad_norm": 0.02663426101207733,
|
||||
"learning_rate": 2.38404452690167e-06,
|
||||
"loss": 0.8812618255615234,
|
||||
"mean_token_accuracy": 0.7641567781567573,
|
||||
"num_tokens": 39595803.0,
|
||||
"step": 300
|
||||
},
|
||||
{
|
||||
"entropy": 0.900223555136472,
|
||||
"epoch": 0.5577734045543997,
|
||||
"grad_norm": 0.026907267048954964,
|
||||
"learning_rate": 2.2912801484230057e-06,
|
||||
"loss": 0.8773960113525391,
|
||||
"mean_token_accuracy": 0.7646851245313883,
|
||||
"num_tokens": 40918054.0,
|
||||
"step": 310
|
||||
},
|
||||
{
|
||||
"entropy": 0.9072908268310129,
|
||||
"epoch": 0.5757660950238965,
|
||||
"grad_norm": 0.033084969967603683,
|
||||
"learning_rate": 2.1985157699443415e-06,
|
||||
"loss": 0.8849006652832031,
|
||||
"mean_token_accuracy": 0.7633785914629698,
|
||||
"num_tokens": 42245476.0,
|
||||
"step": 320
|
||||
},
|
||||
{
|
||||
"entropy": 0.9075088860467077,
|
||||
"epoch": 0.5937587854933933,
|
||||
"grad_norm": 0.029511412605643272,
|
||||
"learning_rate": 2.1057513914656773e-06,
|
||||
"loss": 0.8799509048461914,
|
||||
"mean_token_accuracy": 0.7644402593374252,
|
||||
"num_tokens": 43592571.0,
|
||||
"step": 330
|
||||
},
|
||||
{
|
||||
"entropy": 0.897929747030139,
|
||||
"epoch": 0.61175147596289,
|
||||
"grad_norm": 0.027747338637709618,
|
||||
"learning_rate": 2.012987012987013e-06,
|
||||
"loss": 0.8784950256347657,
|
||||
"mean_token_accuracy": 0.7654943082481622,
|
||||
"num_tokens": 44949762.0,
|
||||
"step": 340
|
||||
},
|
||||
{
|
||||
"entropy": 0.8959064597263933,
|
||||
"epoch": 0.6297441664323868,
|
||||
"grad_norm": 0.02585972286760807,
|
||||
"learning_rate": 1.920222634508349e-06,
|
||||
"loss": 0.8677197456359863,
|
||||
"mean_token_accuracy": 0.7666845623403787,
|
||||
"num_tokens": 46266907.0,
|
||||
"step": 350
|
||||
},
|
||||
{
|
||||
"entropy": 0.9085025515407323,
|
||||
"epoch": 0.6477368569018837,
|
||||
"grad_norm": 0.026946574449539185,
|
||||
"learning_rate": 1.8274582560296848e-06,
|
||||
"loss": 0.8925327301025391,
|
||||
"mean_token_accuracy": 0.7623184407129884,
|
||||
"num_tokens": 47577598.0,
|
||||
"step": 360
|
||||
},
|
||||
{
|
||||
"entropy": 0.8742405578494072,
|
||||
"epoch": 0.6657295473713803,
|
||||
"grad_norm": 0.026929043233394623,
|
||||
"learning_rate": 1.7346938775510206e-06,
|
||||
"loss": 0.8524269104003906,
|
||||
"mean_token_accuracy": 0.7705512259155511,
|
||||
"num_tokens": 48888300.0,
|
||||
"step": 370
|
||||
},
|
||||
{
|
||||
"entropy": 0.9005698974244296,
|
||||
"epoch": 0.6837222378408772,
|
||||
"grad_norm": 0.027014046907424927,
|
||||
"learning_rate": 1.6419294990723564e-06,
|
||||
"loss": 0.8712619781494141,
|
||||
"mean_token_accuracy": 0.7643290877342224,
|
||||
"num_tokens": 50229069.0,
|
||||
"step": 380
|
||||
},
|
||||
{
|
||||
"entropy": 0.8819140480831266,
|
||||
"epoch": 0.701714928310374,
|
||||
"grad_norm": 0.028174864128232002,
|
||||
"learning_rate": 1.5491651205936922e-06,
|
||||
"loss": 0.8646106719970703,
|
||||
"mean_token_accuracy": 0.7674408122897148,
|
||||
"num_tokens": 51578947.0,
|
||||
"step": 390
|
||||
},
|
||||
{
|
||||
"entropy": 0.8925842920318245,
|
||||
"epoch": 0.7197076187798707,
|
||||
"grad_norm": 0.027017617598176003,
|
||||
"learning_rate": 1.456400742115028e-06,
|
||||
"loss": 0.8714614868164062,
|
||||
"mean_token_accuracy": 0.7669254776090384,
|
||||
"num_tokens": 52930805.0,
|
||||
"step": 400
|
||||
},
|
||||
{
|
||||
"entropy": 0.889844935759902,
|
||||
"epoch": 0.7377003092493675,
|
||||
"grad_norm": 0.02721812203526497,
|
||||
"learning_rate": 1.3636363636363636e-06,
|
||||
"loss": 0.8674912452697754,
|
||||
"mean_token_accuracy": 0.7662461360916495,
|
||||
"num_tokens": 54224294.0,
|
||||
"step": 410
|
||||
},
|
||||
{
|
||||
"entropy": 0.8719520575366915,
|
||||
"epoch": 0.7556929997188642,
|
||||
"grad_norm": 0.028012819588184357,
|
||||
"learning_rate": 1.2708719851576994e-06,
|
||||
"loss": 0.8511224746704101,
|
||||
"mean_token_accuracy": 0.7702083302661776,
|
||||
"num_tokens": 55540584.0,
|
||||
"step": 420
|
||||
},
|
||||
{
|
||||
"entropy": 0.8898111075162888,
|
||||
"epoch": 0.773685690188361,
|
||||
"grad_norm": 0.02642475627362728,
|
||||
"learning_rate": 1.1781076066790352e-06,
|
||||
"loss": 0.8730297088623047,
|
||||
"mean_token_accuracy": 0.7653367448598146,
|
||||
"num_tokens": 56827841.0,
|
||||
"step": 430
|
||||
},
|
||||
{
|
||||
"entropy": 0.8857162812724709,
|
||||
"epoch": 0.7916783806578578,
|
||||
"grad_norm": 0.02740148827433586,
|
||||
"learning_rate": 1.0853432282003713e-06,
|
||||
"loss": 0.8713733673095703,
|
||||
"mean_token_accuracy": 0.7659575197845697,
|
||||
"num_tokens": 58130682.0,
|
||||
"step": 440
|
||||
},
|
||||
{
|
||||
"entropy": 0.8843438906595111,
|
||||
"epoch": 0.8096710711273545,
|
||||
"grad_norm": 0.025668496266007423,
|
||||
"learning_rate": 9.925788497217069e-07,
|
||||
"loss": 0.8760784149169922,
|
||||
"mean_token_accuracy": 0.7651905825361609,
|
||||
"num_tokens": 59444140.0,
|
||||
"step": 450
|
||||
},
|
||||
{
|
||||
"entropy": 0.876284147053957,
|
||||
"epoch": 0.8276637615968513,
|
||||
"grad_norm": 0.026019152253866196,
|
||||
"learning_rate": 8.998144712430428e-07,
|
||||
"loss": 0.8590941429138184,
|
||||
"mean_token_accuracy": 0.7688775883987546,
|
||||
"num_tokens": 60778522.0,
|
||||
"step": 460
|
||||
},
|
||||
{
|
||||
"entropy": 0.8704025126062334,
|
||||
"epoch": 0.8456564520663481,
|
||||
"grad_norm": 0.024385536089539528,
|
||||
"learning_rate": 8.070500927643786e-07,
|
||||
"loss": 0.8481533050537109,
|
||||
"mean_token_accuracy": 0.7709953064098954,
|
||||
"num_tokens": 62138075.0,
|
||||
"step": 470
|
||||
},
|
||||
{
|
||||
"entropy": 0.886689430475235,
|
||||
"epoch": 0.8636491425358448,
|
||||
"grad_norm": 0.027147600427269936,
|
||||
"learning_rate": 7.142857142857143e-07,
|
||||
"loss": 0.8655129432678222,
|
||||
"mean_token_accuracy": 0.7670928187668323,
|
||||
"num_tokens": 63450349.0,
|
||||
"step": 480
|
||||
},
|
||||
{
|
||||
"entropy": 0.8841921042650938,
|
||||
"epoch": 0.8816418330053416,
|
||||
"grad_norm": 0.025846796110272408,
|
||||
"learning_rate": 6.215213358070501e-07,
|
||||
"loss": 0.8744302749633789,
|
||||
"mean_token_accuracy": 0.7654220588505268,
|
||||
"num_tokens": 64770576.0,
|
||||
"step": 490
|
||||
},
|
||||
{
|
||||
"entropy": 0.8944361335597932,
|
||||
"epoch": 0.8996345234748383,
|
||||
"grad_norm": 0.025025852024555206,
|
||||
"learning_rate": 5.287569573283859e-07,
|
||||
"loss": 0.8789453506469727,
|
||||
"mean_token_accuracy": 0.7639346193522215,
|
||||
"num_tokens": 66113087.0,
|
||||
"step": 500
|
||||
},
|
||||
{
|
||||
"entropy": 0.8843724082224071,
|
||||
"epoch": 0.9176272139443351,
|
||||
"grad_norm": 0.02651493437588215,
|
||||
"learning_rate": 4.359925788497217e-07,
|
||||
"loss": 0.8675421714782715,
|
||||
"mean_token_accuracy": 0.7664000844582916,
|
||||
"num_tokens": 67464302.0,
|
||||
"step": 510
|
||||
},
|
||||
{
|
||||
"entropy": 0.8899071650579572,
|
||||
"epoch": 0.9356199044138319,
|
||||
"grad_norm": 0.025058092549443245,
|
||||
"learning_rate": 3.4322820037105757e-07,
|
||||
"loss": 0.879638385772705,
|
||||
"mean_token_accuracy": 0.7650359075516462,
|
||||
"num_tokens": 68809443.0,
|
||||
"step": 520
|
||||
},
|
||||
{
|
||||
"entropy": 0.8678001549094916,
|
||||
"epoch": 0.9536125948833286,
|
||||
"grad_norm": 0.025574836879968643,
|
||||
"learning_rate": 2.5046382189239333e-07,
|
||||
"loss": 0.8517162322998046,
|
||||
"mean_token_accuracy": 0.7706384485587477,
|
||||
"num_tokens": 70130884.0,
|
||||
"step": 530
|
||||
},
|
||||
{
|
||||
"entropy": 0.8980348063632846,
|
||||
"epoch": 0.9716052853528254,
|
||||
"grad_norm": 0.02690030448138714,
|
||||
"learning_rate": 1.5769944341372915e-07,
|
||||
"loss": 0.8926727294921875,
|
||||
"mean_token_accuracy": 0.7621918022632599,
|
||||
"num_tokens": 71446103.0,
|
||||
"step": 540
|
||||
},
|
||||
{
|
||||
"entropy": 0.8809462685137988,
|
||||
"epoch": 0.9895979758223222,
|
||||
"grad_norm": 0.02480347640812397,
|
||||
"learning_rate": 6.493506493506495e-08,
|
||||
"loss": 0.8590832710266113,
|
||||
"mean_token_accuracy": 0.7687337175011635,
|
||||
"num_tokens": 72793622.0,
|
||||
"step": 550
|
||||
}
|
||||
],
|
||||
"logging_steps": 10,
|
||||
"max_steps": 556,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 1,
|
||||
"save_steps": 500,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 1.0599989240114708e+18,
|
||||
"train_batch_size": 2,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
checkpoint-556/training_args.bin
Normal file
3
checkpoint-556/training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:7238b353eb99d116ecb084fb9fed131ddd6214e745694796c3a48165bc8ba1a9
|
||||
size 6033
|
||||
Reference in New Issue
Block a user