初始化项目,由ModelHub XC社区提供模型
Model: modrill/kodcode_3_qwen3_4b_sft Source: Original Platform
This commit is contained in:
12
checkpoint-659/chat_template.jinja
Normal file
12
checkpoint-659/chat_template.jinja
Normal file
@@ -0,0 +1,12 @@
|
||||
{%- for message in messages %}
|
||||
{%- if message.role == "user" %}
|
||||
{{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }}
|
||||
{%- elif message.role == "system" %}
|
||||
{{- '<|im_start|>system\n' + message.content + '<|im_end|>\n' }}
|
||||
{%- elif message.role == "assistant" %}
|
||||
{{- '<|im_start|>assistant\n' }}{% generation %}{{ message.content }}{% endgeneration %}{{ '<|im_end|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|im_start|>assistant\n' }}
|
||||
{%- endif %}
|
||||
71
checkpoint-659/config.json
Normal file
71
checkpoint-659/config.json
Normal file
@@ -0,0 +1,71 @@
|
||||
{
|
||||
"architectures": [
|
||||
"Qwen3ForCausalLM"
|
||||
],
|
||||
"attention_bias": false,
|
||||
"attention_dropout": 0.0,
|
||||
"bos_token_id": null,
|
||||
"dtype": "bfloat16",
|
||||
"eos_token_id": 151645,
|
||||
"head_dim": 128,
|
||||
"hidden_act": "silu",
|
||||
"hidden_size": 2560,
|
||||
"initializer_range": 0.02,
|
||||
"intermediate_size": 9728,
|
||||
"layer_types": [
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention",
|
||||
"full_attention"
|
||||
],
|
||||
"max_position_embeddings": 32768,
|
||||
"max_window_layers": 36,
|
||||
"model_type": "qwen3",
|
||||
"num_attention_heads": 32,
|
||||
"num_hidden_layers": 36,
|
||||
"num_key_value_heads": 8,
|
||||
"pad_token_id": 151643,
|
||||
"rms_norm_eps": 1e-06,
|
||||
"rope_parameters": {
|
||||
"rope_theta": 1000000,
|
||||
"rope_type": "default"
|
||||
},
|
||||
"sliding_window": null,
|
||||
"tie_word_embeddings": true,
|
||||
"transformers_version": "5.8.0",
|
||||
"use_cache": false,
|
||||
"use_sliding_window": false,
|
||||
"vocab_size": 151936
|
||||
}
|
||||
10
checkpoint-659/generation_config.json
Normal file
10
checkpoint-659/generation_config.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"do_sample": false,
|
||||
"eos_token_id": [
|
||||
151645,
|
||||
151643
|
||||
],
|
||||
"max_new_tokens": 2048,
|
||||
"pad_token_id": 151643,
|
||||
"transformers_version": "5.8.0"
|
||||
}
|
||||
3
checkpoint-659/model.safetensors
Normal file
3
checkpoint-659/model.safetensors
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b7e18e9f555d63e04b2a5d67a10939ba633cf584772c20342c840bc3158f275c
|
||||
size 8044982080
|
||||
3
checkpoint-659/optimizer.pt
Normal file
3
checkpoint-659/optimizer.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:71c2839680bde13ef34d634ec2c6bbbe6f84f6c1edb7b687385a7647accfd8ce
|
||||
size 16090225449
|
||||
3
checkpoint-659/rng_state_0.pth
Normal file
3
checkpoint-659/rng_state_0.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:659b1cdee2219458dd84ce6a632a595465680b8080e5c44bd600ff97eca8d752
|
||||
size 15429
|
||||
3
checkpoint-659/rng_state_1.pth
Normal file
3
checkpoint-659/rng_state_1.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:86accf27064cdd503053e90476a6bd10de333d4ff0594535ad55ea13a473c91d
|
||||
size 15429
|
||||
3
checkpoint-659/rng_state_2.pth
Normal file
3
checkpoint-659/rng_state_2.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:18ca8d714ef40be035404c1957b5a4dee84e1f43980408393f8aa710552ee6f6
|
||||
size 15429
|
||||
3
checkpoint-659/rng_state_3.pth
Normal file
3
checkpoint-659/rng_state_3.pth
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2cfdebe99e40accc9c9d8f09c63136a14abda997d9b501969ec8e16e9d183179
|
||||
size 15429
|
||||
3
checkpoint-659/scheduler.pt
Normal file
3
checkpoint-659/scheduler.pt
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:638f76f10b8122f6b6d00ef579bef156aea843a74d8ad66f5d19ea5b06be426f
|
||||
size 1465
|
||||
3
checkpoint-659/tokenizer.json
Normal file
3
checkpoint-659/tokenizer.json
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
|
||||
size 11422650
|
||||
30
checkpoint-659/tokenizer_config.json
Normal file
30
checkpoint-659/tokenizer_config.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"add_prefix_space": false,
|
||||
"backend": "tokenizers",
|
||||
"bos_token": null,
|
||||
"clean_up_tokenization_spaces": false,
|
||||
"eos_token": "<|im_end|>",
|
||||
"errors": "replace",
|
||||
"extra_special_tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>",
|
||||
"<|object_ref_start|>",
|
||||
"<|object_ref_end|>",
|
||||
"<|box_start|>",
|
||||
"<|box_end|>",
|
||||
"<|quad_start|>",
|
||||
"<|quad_end|>",
|
||||
"<|vision_start|>",
|
||||
"<|vision_end|>",
|
||||
"<|vision_pad|>",
|
||||
"<|image_pad|>",
|
||||
"<|video_pad|>"
|
||||
],
|
||||
"is_local": false,
|
||||
"local_files_only": false,
|
||||
"model_max_length": 131072,
|
||||
"pad_token": "<|endoftext|>",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "Qwen2Tokenizer",
|
||||
"unk_token": null
|
||||
}
|
||||
684
checkpoint-659/trainer_state.json
Normal file
684
checkpoint-659/trainer_state.json
Normal file
@@ -0,0 +1,684 @@
|
||||
{
|
||||
"best_global_step": null,
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 1.0,
|
||||
"eval_steps": 500,
|
||||
"global_step": 659,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"entropy": 0.3487173642963171,
|
||||
"epoch": 0.015186028853454821,
|
||||
"grad_norm": 1.609375,
|
||||
"learning_rate": 2.7272727272727272e-06,
|
||||
"loss": 0.4099268913269043,
|
||||
"mean_token_accuracy": 0.8717762351036071,
|
||||
"num_tokens": 568708.0,
|
||||
"step": 10
|
||||
},
|
||||
{
|
||||
"entropy": 0.37818768359720706,
|
||||
"epoch": 0.030372057706909643,
|
||||
"grad_norm": 0.87890625,
|
||||
"learning_rate": 5.7575757575757586e-06,
|
||||
"loss": 0.39949469566345214,
|
||||
"mean_token_accuracy": 0.8737476468086243,
|
||||
"num_tokens": 1125639.0,
|
||||
"step": 20
|
||||
},
|
||||
{
|
||||
"entropy": 0.40618006475269797,
|
||||
"epoch": 0.04555808656036447,
|
||||
"grad_norm": 0.70703125,
|
||||
"learning_rate": 8.787878787878788e-06,
|
||||
"loss": 0.3975033760070801,
|
||||
"mean_token_accuracy": 0.8727035835385323,
|
||||
"num_tokens": 1683225.0,
|
||||
"step": 30
|
||||
},
|
||||
{
|
||||
"entropy": 0.3896496780216694,
|
||||
"epoch": 0.060744115413819286,
|
||||
"grad_norm": 0.6953125,
|
||||
"learning_rate": 9.997733473639876e-06,
|
||||
"loss": 0.3925030708312988,
|
||||
"mean_token_accuracy": 0.8742863699793816,
|
||||
"num_tokens": 2236895.0,
|
||||
"step": 40
|
||||
},
|
||||
{
|
||||
"entropy": 0.37322904225438835,
|
||||
"epoch": 0.07593014426727411,
|
||||
"grad_norm": 0.68359375,
|
||||
"learning_rate": 9.983889919973586e-06,
|
||||
"loss": 0.3752753257751465,
|
||||
"mean_token_accuracy": 0.8792506881058216,
|
||||
"num_tokens": 2818707.0,
|
||||
"step": 50
|
||||
},
|
||||
{
|
||||
"entropy": 0.3811411205679178,
|
||||
"epoch": 0.09111617312072894,
|
||||
"grad_norm": 0.66796875,
|
||||
"learning_rate": 9.957496810072027e-06,
|
||||
"loss": 0.38604438304901123,
|
||||
"mean_token_accuracy": 0.8750339619815349,
|
||||
"num_tokens": 3351348.0,
|
||||
"step": 60
|
||||
},
|
||||
{
|
||||
"entropy": 0.3796327030286193,
|
||||
"epoch": 0.10630220197418375,
|
||||
"grad_norm": 0.66015625,
|
||||
"learning_rate": 9.918620602428916e-06,
|
||||
"loss": 0.37710745334625245,
|
||||
"mean_token_accuracy": 0.8776259452104569,
|
||||
"num_tokens": 3915545.0,
|
||||
"step": 70
|
||||
},
|
||||
{
|
||||
"entropy": 0.37812459245324137,
|
||||
"epoch": 0.12148823082763857,
|
||||
"grad_norm": 0.64453125,
|
||||
"learning_rate": 9.867359188282193e-06,
|
||||
"loss": 0.38009963035583494,
|
||||
"mean_token_accuracy": 0.8783061921596527,
|
||||
"num_tokens": 4462906.0,
|
||||
"step": 80
|
||||
},
|
||||
{
|
||||
"entropy": 0.3751340739428997,
|
||||
"epoch": 0.1366742596810934,
|
||||
"grad_norm": 0.6640625,
|
||||
"learning_rate": 9.803841645121505e-06,
|
||||
"loss": 0.37636594772338866,
|
||||
"mean_token_accuracy": 0.8778362341225148,
|
||||
"num_tokens": 5029003.0,
|
||||
"step": 90
|
||||
},
|
||||
{
|
||||
"entropy": 0.37551863975822924,
|
||||
"epoch": 0.15186028853454822,
|
||||
"grad_norm": 0.6640625,
|
||||
"learning_rate": 9.728227911667934e-06,
|
||||
"loss": 0.3773549795150757,
|
||||
"mean_token_accuracy": 0.8772883579134941,
|
||||
"num_tokens": 5596042.0,
|
||||
"step": 100
|
||||
},
|
||||
{
|
||||
"entropy": 0.3809764288365841,
|
||||
"epoch": 0.16704631738800305,
|
||||
"grad_norm": 0.71484375,
|
||||
"learning_rate": 9.640708385144403e-06,
|
||||
"loss": 0.3807323932647705,
|
||||
"mean_token_accuracy": 0.8774459846317768,
|
||||
"num_tokens": 6144821.0,
|
||||
"step": 110
|
||||
},
|
||||
{
|
||||
"entropy": 0.37467240951955316,
|
||||
"epoch": 0.18223234624145787,
|
||||
"grad_norm": 0.62890625,
|
||||
"learning_rate": 9.541503441850844e-06,
|
||||
"loss": 0.37542564868927003,
|
||||
"mean_token_accuracy": 0.8782215595245362,
|
||||
"num_tokens": 6691491.0,
|
||||
"step": 120
|
||||
},
|
||||
{
|
||||
"entropy": 0.3787713166326284,
|
||||
"epoch": 0.19741837509491267,
|
||||
"grad_norm": 0.7109375,
|
||||
"learning_rate": 9.430862882251279e-06,
|
||||
"loss": 0.37993783950805665,
|
||||
"mean_token_accuracy": 0.8774278596043587,
|
||||
"num_tokens": 7247335.0,
|
||||
"step": 130
|
||||
},
|
||||
{
|
||||
"entropy": 0.3862619888037443,
|
||||
"epoch": 0.2126044039483675,
|
||||
"grad_norm": 0.71484375,
|
||||
"learning_rate": 9.309065301970193e-06,
|
||||
"loss": 0.38727219104766847,
|
||||
"mean_token_accuracy": 0.8749251998960972,
|
||||
"num_tokens": 7808664.0,
|
||||
"step": 140
|
||||
},
|
||||
{
|
||||
"entropy": 0.3778634283691645,
|
||||
"epoch": 0.22779043280182232,
|
||||
"grad_norm": 0.71484375,
|
||||
"learning_rate": 9.176417390281944e-06,
|
||||
"loss": 0.38028583526611326,
|
||||
"mean_token_accuracy": 0.8772468723356723,
|
||||
"num_tokens": 8360893.0,
|
||||
"step": 150
|
||||
},
|
||||
{
|
||||
"entropy": 0.3749677825719118,
|
||||
"epoch": 0.24297646165527714,
|
||||
"grad_norm": 0.69921875,
|
||||
"learning_rate": 9.033253157859715e-06,
|
||||
"loss": 0.37344467639923096,
|
||||
"mean_token_accuracy": 0.8786589197814465,
|
||||
"num_tokens": 8905139.0,
|
||||
"step": 160
|
||||
},
|
||||
{
|
||||
"entropy": 0.37994367331266404,
|
||||
"epoch": 0.25816249050873197,
|
||||
"grad_norm": 0.69140625,
|
||||
"learning_rate": 8.879933095728485e-06,
|
||||
"loss": 0.38379650115966796,
|
||||
"mean_token_accuracy": 0.8768095754086971,
|
||||
"num_tokens": 9467791.0,
|
||||
"step": 170
|
||||
},
|
||||
{
|
||||
"entropy": 0.3774459037929773,
|
||||
"epoch": 0.2733485193621868,
|
||||
"grad_norm": 0.7109375,
|
||||
"learning_rate": 8.716843267539868e-06,
|
||||
"loss": 0.3767258644104004,
|
||||
"mean_token_accuracy": 0.8779186218976974,
|
||||
"num_tokens": 10013526.0,
|
||||
"step": 180
|
||||
},
|
||||
{
|
||||
"entropy": 0.3706828704103827,
|
||||
"epoch": 0.2885345482156416,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 8.544394337454409e-06,
|
||||
"loss": 0.373125958442688,
|
||||
"mean_token_accuracy": 0.8792334951460361,
|
||||
"num_tokens": 10567209.0,
|
||||
"step": 190
|
||||
},
|
||||
{
|
||||
"entropy": 0.3781122103333473,
|
||||
"epoch": 0.30372057706909644,
|
||||
"grad_norm": 0.7109375,
|
||||
"learning_rate": 8.36302053607924e-06,
|
||||
"loss": 0.3779691457748413,
|
||||
"mean_token_accuracy": 0.877835976332426,
|
||||
"num_tokens": 11121802.0,
|
||||
"step": 200
|
||||
},
|
||||
{
|
||||
"entropy": 0.37852676026523113,
|
||||
"epoch": 0.31890660592255127,
|
||||
"grad_norm": 0.703125,
|
||||
"learning_rate": 8.17317856706482e-06,
|
||||
"loss": 0.37905910015106203,
|
||||
"mean_token_accuracy": 0.877592646330595,
|
||||
"num_tokens": 11677885.0,
|
||||
"step": 210
|
||||
},
|
||||
{
|
||||
"entropy": 0.376834512129426,
|
||||
"epoch": 0.3340926347760061,
|
||||
"grad_norm": 0.66015625,
|
||||
"learning_rate": 7.975346457114034e-06,
|
||||
"loss": 0.3753563404083252,
|
||||
"mean_token_accuracy": 0.8776590585708618,
|
||||
"num_tokens": 12235216.0,
|
||||
"step": 220
|
||||
},
|
||||
{
|
||||
"entropy": 0.3737819105386734,
|
||||
"epoch": 0.3492786636294609,
|
||||
"grad_norm": 0.671875,
|
||||
"learning_rate": 7.770022352299294e-06,
|
||||
"loss": 0.37358593940734863,
|
||||
"mean_token_accuracy": 0.878921328485012,
|
||||
"num_tokens": 12787759.0,
|
||||
"step": 230
|
||||
},
|
||||
{
|
||||
"entropy": 0.3769740372896194,
|
||||
"epoch": 0.36446469248291574,
|
||||
"grad_norm": 0.73046875,
|
||||
"learning_rate": 7.557723263718596e-06,
|
||||
"loss": 0.37995898723602295,
|
||||
"mean_token_accuracy": 0.8769361607730388,
|
||||
"num_tokens": 13346471.0,
|
||||
"step": 240
|
||||
},
|
||||
{
|
||||
"entropy": 0.3878506176173687,
|
||||
"epoch": 0.37965072133637057,
|
||||
"grad_norm": 0.70703125,
|
||||
"learning_rate": 7.338983765648985e-06,
|
||||
"loss": 0.38782215118408203,
|
||||
"mean_token_accuracy": 0.8749015353620052,
|
||||
"num_tokens": 13895194.0,
|
||||
"step": 250
|
||||
},
|
||||
{
|
||||
"entropy": 0.37555828876793385,
|
||||
"epoch": 0.39483675018982534,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 7.114354649475499e-06,
|
||||
"loss": 0.3771331787109375,
|
||||
"mean_token_accuracy": 0.878202386945486,
|
||||
"num_tokens": 14453542.0,
|
||||
"step": 260
|
||||
},
|
||||
{
|
||||
"entropy": 0.37179951313883064,
|
||||
"epoch": 0.41002277904328016,
|
||||
"grad_norm": 0.6484375,
|
||||
"learning_rate": 6.884401536785045e-06,
|
||||
"loss": 0.37206058502197265,
|
||||
"mean_token_accuracy": 0.8789021499454975,
|
||||
"num_tokens": 15016280.0,
|
||||
"step": 270
|
||||
},
|
||||
{
|
||||
"entropy": 0.3706459369510412,
|
||||
"epoch": 0.425208807896735,
|
||||
"grad_norm": 0.7109375,
|
||||
"learning_rate": 6.6497034551174585e-06,
|
||||
"loss": 0.37101426124572756,
|
||||
"mean_token_accuracy": 0.8798882246017456,
|
||||
"num_tokens": 15561057.0,
|
||||
"step": 280
|
||||
},
|
||||
{
|
||||
"entropy": 0.37728526555001735,
|
||||
"epoch": 0.4403948367501898,
|
||||
"grad_norm": 0.65625,
|
||||
"learning_rate": 6.41085137996006e-06,
|
||||
"loss": 0.37785754203796384,
|
||||
"mean_token_accuracy": 0.8779311388731003,
|
||||
"num_tokens": 16127699.0,
|
||||
"step": 290
|
||||
},
|
||||
{
|
||||
"entropy": 0.3816283464431763,
|
||||
"epoch": 0.45558086560364464,
|
||||
"grad_norm": 0.81640625,
|
||||
"learning_rate": 6.168446746656973e-06,
|
||||
"loss": 0.3794879674911499,
|
||||
"mean_token_accuracy": 0.8773063771426678,
|
||||
"num_tokens": 16686457.0,
|
||||
"step": 300
|
||||
},
|
||||
{
|
||||
"entropy": 0.37517447732388975,
|
||||
"epoch": 0.47076689445709946,
|
||||
"grad_norm": 0.70703125,
|
||||
"learning_rate": 5.923099935980278e-06,
|
||||
"loss": 0.3782352924346924,
|
||||
"mean_token_accuracy": 0.8787827685475349,
|
||||
"num_tokens": 17254272.0,
|
||||
"step": 310
|
||||
},
|
||||
{
|
||||
"entropy": 0.374018133059144,
|
||||
"epoch": 0.4859529233105543,
|
||||
"grad_norm": 0.71484375,
|
||||
"learning_rate": 5.675428737176367e-06,
|
||||
"loss": 0.37341156005859377,
|
||||
"mean_token_accuracy": 0.8788688823580741,
|
||||
"num_tokens": 17809900.0,
|
||||
"step": 320
|
||||
},
|
||||
{
|
||||
"entropy": 0.3753270395100117,
|
||||
"epoch": 0.5011389521640092,
|
||||
"grad_norm": 0.68359375,
|
||||
"learning_rate": 5.426056792357552e-06,
|
||||
"loss": 0.3752497673034668,
|
||||
"mean_token_accuracy": 0.8784179173409938,
|
||||
"num_tokens": 18379566.0,
|
||||
"step": 330
|
||||
},
|
||||
{
|
||||
"entropy": 0.3742110010236502,
|
||||
"epoch": 0.5163249810174639,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 5.175612026156045e-06,
|
||||
"loss": 0.3746063232421875,
|
||||
"mean_token_accuracy": 0.8782069273293018,
|
||||
"num_tokens": 18943281.0,
|
||||
"step": 340
|
||||
},
|
||||
{
|
||||
"entropy": 0.37444472052156924,
|
||||
"epoch": 0.5315110098709187,
|
||||
"grad_norm": 0.71484375,
|
||||
"learning_rate": 4.924725064594448e-06,
|
||||
"loss": 0.3729024171829224,
|
||||
"mean_token_accuracy": 0.8787923693656922,
|
||||
"num_tokens": 19488865.0,
|
||||
"step": 350
|
||||
},
|
||||
{
|
||||
"entropy": 0.3750518877059221,
|
||||
"epoch": 0.5466970387243736,
|
||||
"grad_norm": 0.78515625,
|
||||
"learning_rate": 4.674027647154037e-06,
|
||||
"loss": 0.3758077621459961,
|
||||
"mean_token_accuracy": 0.8765743866562843,
|
||||
"num_tokens": 20048281.0,
|
||||
"step": 360
|
||||
},
|
||||
{
|
||||
"entropy": 0.3787516813725233,
|
||||
"epoch": 0.5618830675778284,
|
||||
"grad_norm": 0.74609375,
|
||||
"learning_rate": 4.424151036039381e-06,
|
||||
"loss": 0.3790909767150879,
|
||||
"mean_token_accuracy": 0.8769759923219681,
|
||||
"num_tokens": 20597434.0,
|
||||
"step": 370
|
||||
},
|
||||
{
|
||||
"entropy": 0.3786877432838082,
|
||||
"epoch": 0.5770690964312832,
|
||||
"grad_norm": 0.68359375,
|
||||
"learning_rate": 4.175724426644724e-06,
|
||||
"loss": 0.3812232971191406,
|
||||
"mean_token_accuracy": 0.8777030549943448,
|
||||
"num_tokens": 21161267.0,
|
||||
"step": 380
|
||||
},
|
||||
{
|
||||
"entropy": 0.37311795353889465,
|
||||
"epoch": 0.592255125284738,
|
||||
"grad_norm": 0.6796875,
|
||||
"learning_rate": 3.929373363224654e-06,
|
||||
"loss": 0.3731100559234619,
|
||||
"mean_token_accuracy": 0.8793233536183834,
|
||||
"num_tokens": 21709421.0,
|
||||
"step": 390
|
||||
},
|
||||
{
|
||||
"entropy": 0.3734915753826499,
|
||||
"epoch": 0.6074411541381929,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 3.685718163758427e-06,
|
||||
"loss": 0.37124335765838623,
|
||||
"mean_token_accuracy": 0.8786324210464954,
|
||||
"num_tokens": 22250023.0,
|
||||
"step": 400
|
||||
},
|
||||
{
|
||||
"entropy": 0.3722789943218231,
|
||||
"epoch": 0.6226271829916477,
|
||||
"grad_norm": 0.67578125,
|
||||
"learning_rate": 3.445372357974194e-06,
|
||||
"loss": 0.37429609298706057,
|
||||
"mean_token_accuracy": 0.8784996062517166,
|
||||
"num_tokens": 22802881.0,
|
||||
"step": 410
|
||||
},
|
||||
{
|
||||
"entropy": 0.3826644644141197,
|
||||
"epoch": 0.6378132118451025,
|
||||
"grad_norm": 0.65625,
|
||||
"learning_rate": 3.2089411424661864e-06,
|
||||
"loss": 0.3828511953353882,
|
||||
"mean_token_accuracy": 0.875868634134531,
|
||||
"num_tokens": 23368508.0,
|
||||
"step": 420
|
||||
},
|
||||
{
|
||||
"entropy": 0.36304581388831136,
|
||||
"epoch": 0.6529992406985573,
|
||||
"grad_norm": 0.703125,
|
||||
"learning_rate": 2.977019856794955e-06,
|
||||
"loss": 0.362534499168396,
|
||||
"mean_token_accuracy": 0.8821237675845623,
|
||||
"num_tokens": 23923709.0,
|
||||
"step": 430
|
||||
},
|
||||
{
|
||||
"entropy": 0.38759873658418653,
|
||||
"epoch": 0.6681852695520122,
|
||||
"grad_norm": 0.67578125,
|
||||
"learning_rate": 2.7501924844078538e-06,
|
||||
"loss": 0.38718571662902834,
|
||||
"mean_token_accuracy": 0.8746685221791267,
|
||||
"num_tokens": 24477925.0,
|
||||
"step": 440
|
||||
},
|
||||
{
|
||||
"entropy": 0.3708066754043102,
|
||||
"epoch": 0.683371298405467,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 2.5290301821544826e-06,
|
||||
"loss": 0.36970815658569334,
|
||||
"mean_token_accuracy": 0.8802599847316742,
|
||||
"num_tokens": 25027245.0,
|
||||
"step": 450
|
||||
},
|
||||
{
|
||||
"entropy": 0.36688214987516404,
|
||||
"epoch": 0.6985573272589218,
|
||||
"grad_norm": 0.68359375,
|
||||
"learning_rate": 2.3140898420998425e-06,
|
||||
"loss": 0.3657586097717285,
|
||||
"mean_token_accuracy": 0.8809232294559479,
|
||||
"num_tokens": 25582534.0,
|
||||
"step": 460
|
||||
},
|
||||
{
|
||||
"entropy": 0.36983687337487936,
|
||||
"epoch": 0.7137433561123766,
|
||||
"grad_norm": 0.7109375,
|
||||
"learning_rate": 2.105912689256533e-06,
|
||||
"loss": 0.37239837646484375,
|
||||
"mean_token_accuracy": 0.8794391065835953,
|
||||
"num_tokens": 26134875.0,
|
||||
"step": 470
|
||||
},
|
||||
{
|
||||
"entropy": 0.3732773784548044,
|
||||
"epoch": 0.7289293849658315,
|
||||
"grad_norm": 0.63671875,
|
||||
"learning_rate": 1.905022918766995e-06,
|
||||
"loss": 0.37306258678436277,
|
||||
"mean_token_accuracy": 0.8793606124818325,
|
||||
"num_tokens": 26681084.0,
|
||||
"step": 480
|
||||
},
|
||||
{
|
||||
"entropy": 0.3710829207673669,
|
||||
"epoch": 0.7441154138192863,
|
||||
"grad_norm": 0.69921875,
|
||||
"learning_rate": 1.7119263759673677e-06,
|
||||
"loss": 0.3711911678314209,
|
||||
"mean_token_accuracy": 0.8803343921899796,
|
||||
"num_tokens": 27234332.0,
|
||||
"step": 490
|
||||
},
|
||||
{
|
||||
"entropy": 0.3832434043288231,
|
||||
"epoch": 0.7593014426727411,
|
||||
"grad_norm": 0.9921875,
|
||||
"learning_rate": 1.5271092826566108e-06,
|
||||
"loss": 0.3841698169708252,
|
||||
"mean_token_accuracy": 0.8759766638278961,
|
||||
"num_tokens": 27794602.0,
|
||||
"step": 500
|
||||
},
|
||||
{
|
||||
"entropy": 0.38211295008659363,
|
||||
"epoch": 0.7744874715261959,
|
||||
"grad_norm": 0.8046875,
|
||||
"learning_rate": 1.3510370127781635e-06,
|
||||
"loss": 0.3804590940475464,
|
||||
"mean_token_accuracy": 0.8769169762730599,
|
||||
"num_tokens": 28354831.0,
|
||||
"step": 510
|
||||
},
|
||||
{
|
||||
"entropy": 0.3752284612506628,
|
||||
"epoch": 0.7896735003796507,
|
||||
"grad_norm": 0.73046875,
|
||||
"learning_rate": 1.1841529205970281e-06,
|
||||
"loss": 0.37546916007995607,
|
||||
"mean_token_accuracy": 0.8786770381033421,
|
||||
"num_tokens": 28922852.0,
|
||||
"step": 520
|
||||
},
|
||||
{
|
||||
"entropy": 0.36873019095510245,
|
||||
"epoch": 0.8048595292331056,
|
||||
"grad_norm": 0.6640625,
|
||||
"learning_rate": 1.026877224322923e-06,
|
||||
"loss": 0.36797375679016114,
|
||||
"mean_token_accuracy": 0.880731363594532,
|
||||
"num_tokens": 29493442.0,
|
||||
"step": 530
|
||||
},
|
||||
{
|
||||
"entropy": 0.3788988694548607,
|
||||
"epoch": 0.8200455580865603,
|
||||
"grad_norm": 0.69921875,
|
||||
"learning_rate": 8.7960594799059e-07,
|
||||
"loss": 0.37884984016418455,
|
||||
"mean_token_accuracy": 0.8770281121134758,
|
||||
"num_tokens": 30034443.0,
|
||||
"step": 540
|
||||
},
|
||||
{
|
||||
"entropy": 0.3814096964895725,
|
||||
"epoch": 0.8352315869400152,
|
||||
"grad_norm": 0.73046875,
|
||||
"learning_rate": 7.427099242616348e-07,
|
||||
"loss": 0.3821078300476074,
|
||||
"mean_token_accuracy": 0.8763406798243523,
|
||||
"num_tokens": 30570567.0,
|
||||
"step": 550
|
||||
},
|
||||
{
|
||||
"entropy": 0.3758743409067392,
|
||||
"epoch": 0.85041761579347,
|
||||
"grad_norm": 0.6953125,
|
||||
"learning_rate": 6.165338606588517e-07,
|
||||
"loss": 0.3744307279586792,
|
||||
"mean_token_accuracy": 0.8794760994613171,
|
||||
"num_tokens": 31129457.0,
|
||||
"step": 560
|
||||
},
|
||||
{
|
||||
"entropy": 0.37431446108967065,
|
||||
"epoch": 0.8656036446469249,
|
||||
"grad_norm": 0.6640625,
|
||||
"learning_rate": 5.0139547158427e-07,
|
||||
"loss": 0.37335963249206544,
|
||||
"mean_token_accuracy": 0.8793582506477833,
|
||||
"num_tokens": 31689902.0,
|
||||
"step": 570
|
||||
},
|
||||
{
|
||||
"entropy": 0.38372623883187773,
|
||||
"epoch": 0.8807896735003796,
|
||||
"grad_norm": 0.7578125,
|
||||
"learning_rate": 3.9758467830656623e-07,
|
||||
"loss": 0.38321547508239745,
|
||||
"mean_token_accuracy": 0.8755873307585716,
|
||||
"num_tokens": 32253359.0,
|
||||
"step": 580
|
||||
},
|
||||
{
|
||||
"entropy": 0.36962624490261076,
|
||||
"epoch": 0.8959757023538345,
|
||||
"grad_norm": 0.72265625,
|
||||
"learning_rate": 3.0536287893223603e-07,
|
||||
"loss": 0.37100839614868164,
|
||||
"mean_token_accuracy": 0.8799250744283199,
|
||||
"num_tokens": 32813428.0,
|
||||
"step": 590
|
||||
},
|
||||
{
|
||||
"entropy": 0.3853254303336143,
|
||||
"epoch": 0.9111617312072893,
|
||||
"grad_norm": 0.6640625,
|
||||
"learning_rate": 2.2496229019879635e-07,
|
||||
"loss": 0.3848439693450928,
|
||||
"mean_token_accuracy": 0.8762004837393761,
|
||||
"num_tokens": 33382024.0,
|
||||
"step": 600
|
||||
},
|
||||
{
|
||||
"entropy": 0.3742272950708866,
|
||||
"epoch": 0.9263477600607442,
|
||||
"grad_norm": 0.671875,
|
||||
"learning_rate": 1.5658536274738623e-07,
|
||||
"loss": 0.3725078582763672,
|
||||
"mean_token_accuracy": 0.8787065915763378,
|
||||
"num_tokens": 33939521.0,
|
||||
"step": 610
|
||||
},
|
||||
{
|
||||
"entropy": 0.3774934906512499,
|
||||
"epoch": 0.9415337889141989,
|
||||
"grad_norm": 0.69140625,
|
||||
"learning_rate": 1.004042713471165e-07,
|
||||
"loss": 0.37858588695526124,
|
||||
"mean_token_accuracy": 0.877676124125719,
|
||||
"num_tokens": 34482539.0,
|
||||
"step": 620
|
||||
},
|
||||
{
|
||||
"entropy": 0.37360552567988636,
|
||||
"epoch": 0.9567198177676538,
|
||||
"grad_norm": 0.734375,
|
||||
"learning_rate": 5.6560481354807625e-08,
|
||||
"loss": 0.37269864082336424,
|
||||
"mean_token_accuracy": 0.8786865592002868,
|
||||
"num_tokens": 35045028.0,
|
||||
"step": 630
|
||||
},
|
||||
{
|
||||
"entropy": 0.3767994062975049,
|
||||
"epoch": 0.9719058466211086,
|
||||
"grad_norm": 0.890625,
|
||||
"learning_rate": 2.516439250177749e-08,
|
||||
"loss": 0.3758098125457764,
|
||||
"mean_token_accuracy": 0.8778494797647,
|
||||
"num_tokens": 35600399.0,
|
||||
"step": 640
|
||||
},
|
||||
{
|
||||
"entropy": 0.3786265593022108,
|
||||
"epoch": 0.9870918754745635,
|
||||
"grad_norm": 0.6875,
|
||||
"learning_rate": 6.295060904623618e-09,
|
||||
"loss": 0.378217077255249,
|
||||
"mean_token_accuracy": 0.8778206452727317,
|
||||
"num_tokens": 36162293.0,
|
||||
"step": 650
|
||||
}
|
||||
],
|
||||
"logging_steps": 10,
|
||||
"max_steps": 659,
|
||||
"num_input_tokens_seen": 0,
|
||||
"num_train_epochs": 1,
|
||||
"save_steps": 500,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 9.858614662906511e+17,
|
||||
"train_batch_size": 2,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
||||
3
checkpoint-659/training_args.bin
Normal file
3
checkpoint-659/training_args.bin
Normal file
@@ -0,0 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:514f32c7b13687591d405f4e860f5e4d9145eaaff00bbbfd04aded17ecc9774d
|
||||
size 5777
|
||||
Reference in New Issue
Block a user